diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs.yaml new file mode 100644 index 000000000000..1bf5e060bbaf --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs.yaml @@ -0,0 +1,14934 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x64x128_MI1zEvSrUfniyIPA5Nficsug_rlEaVAGxHNYFMuvmYifqc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB1024_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 64 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41216 + LdsInitCVgprs: false + LdsNumBytes: 41216 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB1024_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x128_MI1oq7eVBWwzey19A8uIqsDkLEvAC8u4jm_3WdeHmyqNY8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB512_LBSPPM0_LPA0_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37120 + LdsInitCVgprs: false + LdsNumBytes: 37120 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB512_LBSPPM0_LPA0_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x128_MI1RipBYLITmgLj7nJiXhyI57h-QdswAlXmvZRQ-t7Yij4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33280 + LdsInitCVgprs: false + LdsNumBytes: 33280 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 90368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 90368 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x128_MI1iIhyCvIYW4EQ_x86i0IZ86zDKGdESkaJ9YYcm0hSGzI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 32 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29184 + LdsInitCVgprs: false + LdsNumBytes: 29184 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 57600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 57600 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 32 + MacroTileA: 192 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 1 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x128_MI1eZeOxd9MrHLQR2GNA65EfarGb_PDYi0bDvnnTj3iymw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29184 + LdsInitCVgprs: false + LdsNumBytes: 29184 + LdsNumElementsAlignedA: 20736 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 20736 + LdsOffsetB_Blk: 53504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 53504 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 2 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x128_MI1fok8NkYtMokxqZZMiJ_iGKUDQvUnmdxY1flvKGmyHyE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25088 + LdsInitCVgprs: false + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 20736 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 20736 + LdsOffsetB_Blk: 53504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 53504 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 32 + MacroTileA: 160 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 5 + NumLoadsB: 1 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x128_MI4THos_lFJzWi_Dnch9qAlPGNCCGbpGTJT_pt2NL4ZB8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA64_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 64 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA64_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x128_MI1pEJy9yEKZBXA9EcPO5rjF-8vFHW5-X8UkNyH2oVn994= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1536_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29440 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 64 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1536_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x128_MI1S6e2ViSr3lOZh36puDEQ4ogQb2Kj5RPE7WzL1lq3F0M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25344 + LdsInitCVgprs: false + LdsNumBytes: 25344 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25344 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x128_MI1C6jTq6qNnho1ENixE4x2q_23xMvUAX0Q3bKaL_ah5XQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 21248 + LdsInitCVgprs: false + LdsNumBytes: 21248 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21248 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 64 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x128_MI1158FxB1URc5cpKzdBy5PKmwBlaNnYvU5pNK2aJTi3FY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB2048_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29440 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB2048_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x128_MI163lHbClsVTGPnNSzP8QvKwmggFBp2OvBLf9IoiXlJZAY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25088 + LdsInitCVgprs: false + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x128_MI16cyaq9MxU7sNbLEazHdXIfoficQy4YOPE4TFqMztilgY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 20992 + LdsInitCVgprs: false + LdsNumBytes: 20992 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x128_MI16O3R5PeDu0JjUKDdGyCc1CoE20Q-U4WqcAOA_B9UwqIo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 16896 + LdsInitCVgprs: false + LdsNumBytes: 16896 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 1 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x256x128_MI1LPyOUj4mpprhT6Es7dZqjCKeRoX1IDrnjJ3fYZl1HPI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 256 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41216 + LdsInitCVgprs: false + LdsNumBytes: 41216 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x128_MI1dsiiVyF-YL7QpBTxukjNXBE5_N2gPbhGhfp7ioSoYBo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33280 + LdsInitCVgprs: false + LdsNumBytes: 33280 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x128_MI1w-MN2Mm1J0St8q8s9xFauQAWxA0khy1Pd4WoSoekSlg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 32 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29184 + LdsInitCVgprs: false + LdsNumBytes: 29184 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 20736 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x128_MI1cgKWeOcB-B078ImMnsIo0ZS6zy_3JERNpCWjdkytEyw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25344 + LdsInitCVgprs: false + LdsNumBytes: 25344 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25344 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x128_MI16_rKHyj2ip3IiAlJ-oI4Mi3S3Jk5OCWj2H83MWBg-Vss= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 32 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 20992 + LdsInitCVgprs: false + LdsNumBytes: 20992 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x128_MI16Qa_B98mEcapAL2iX2YgOcoJisssJyHDlM9u9CPAkaUM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 16896 + LdsInitCVgprs: false + LdsNumBytes: 16896 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x128_MI16EwXWjXKR9ATyJNfH3DyoYi3LQ4H0ugLmKOZi9zzkKhc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 32 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29056 + LdsInitCVgprs: false + LdsNumBytes: 29056 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 24832 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 24832 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x128_MI15LrCqHq2DyagcJd2X5qTNiDHWQnVHm_AqMqcFDlvmsU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB4096_LBSPPM0_LPA16_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37120 + LdsInitCVgprs: false + LdsNumBytes: 37120 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 69888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 69888 + LdsPadA: 16 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 8] + MIWaveTileA: 1 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB4096_LBSPPM0_LPA16_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x128_MI1RS5-E3dcQYltJiaIWglbCOB49DJT8EKGQr29y9ht3jk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29184 + LdsInitCVgprs: false + LdsNumBytes: 29184 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 6] + MIWaveTileA: 1 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 192 + MacroTileA: 32 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 1 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x128_MI1WAM7dQoYboHh0SapFH0XK75zgHRfXzHLhlFGHRFgW8U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25088 + LdsInitCVgprs: false + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 20736 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 160 + MacroTileA: 32 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 1 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x128_MI1fkykvXwBDcTCeZQcsORsfStQSOevaJqa0Sia79tVqpE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 21248 + LdsInitCVgprs: false + LdsNumBytes: 21248 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21248 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x128_MI16WEwxgg8Y4FSUK3HPuX6yKvRQPHQ0p3bui3ZyCjYgs5k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 16896 + LdsInitCVgprs: false + LdsNumBytes: 16896 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 1 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x128_MI16PYalQdoVElqz6wbixR1EPrpAfE0UNhfwtrN4ZWmHKYs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29184 + LdsInitCVgprs: false + LdsNumBytes: 29184 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4352 + LdsOffsetMetadata_Blk: 20736 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x128_MI16XYtEgd7zoKly5GXv2t2pAYLX-e76OQRyOB6Ygozeu0A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24960 + LdsInitCVgprs: false + LdsNumBytes: 24960 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4352 + LdsOffsetMetadata_Blk: 20736 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x128_MI1JWVR-jh-aLg0rrdtghL_Hme5Pul5fnhSrwkCOBuPtIU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB768_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 768 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 39680 + LdsInitCVgprs: false + LdsNumBytes: 39680 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 6400 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 39680 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 48 + MacroTileA: 256 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB768_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x128_MI1PwJ42uBJtJDXBA5J9eUpFXnSBdG873yhqH9x9xHr9aM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35584 + LdsInitCVgprs: false + LdsNumBytes: 35584 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x128_MI1-ckiACWa5_Hl9q0vruUn7fAZYeT9xW9mnDjkhP2c50s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA768_LBSPPB4096_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 39680 + LdsInitCVgprs: false + LdsNumBytes: 39680 + LdsNumElementsAlignedA: 6400 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 39680 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 16 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 256 + MacroTileA: 48 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA768_LBSPPB4096_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x128_MI1MnL3tvF_jGDAY_I8anP7ysG0uEcoQof1urIS_BrpX_k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35584 + LdsInitCVgprs: false + LdsNumBytes: 35584 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 67840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 67840 + LdsPadA: 16 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x256_MI1arXBxIFzyAilDtP1DqyksxdPITOkluw9jmLU5u0Tt2w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x256_MI1rXabrsjNwkup_JrxozMVR_3U4GdmTFKff0KArypEYAA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 64 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x256_MI1VT_L439A1JYsr4mdupck3knPO5y9YC1PTbi05SL_Cz0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x256_MI16zQIQS-CrV5B-hJqIIcfIrW4-99zhrrQJDstr_sFS9t4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x256_MI16yzgS8-MSN2UCxzxDcSy8jzQVwvB-olaZ82o1J7M6llU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 32 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25344 + LdsInitCVgprs: false + LdsNumBytes: 25344 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25344 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x256_MI12hFMSt6lMTI8oWW_7aZTwxjiukY6LB_x-MEbbxd5p3M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 16 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x256_MI163QE_QBcfCjhjm6YvZW1aBPJMqF6E2nryKoAjsugzB0U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25344 + LdsInitCVgprs: false + LdsNumBytes: 25344 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25344 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x256_MI16CqP0rVSFO3WDeB13qT6LUGERdrWprMFCekioRFqiTqQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 16896 + LdsInitCVgprs: false + LdsNumBytes: 16896 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x256_MI1h8DSl_KtVz4vyL0LXehy8GQa575LmnT-GS8Sm349l8c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 16 + LSPA: 16 + LSPB: 256 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 70912 + LdsInitCVgprs: false + LdsNumBytes: 70912 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 70912 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 64 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x256_MI1ABd-H42vx6NZOkifL5ShxVhgON-oxRsJ0GyaVxfUhIg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 256 + LSPA: 256 + LSPB: 16 + LVCA: 1 + LVCB: 16 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 70912 + LdsInitCVgprs: false + LdsNumBytes: 70912 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 66560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 135424 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 70912 + LdsOffsetMetadata_Blk: 135424 + LdsPadA: 16 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x512_MI160Nuqi97WrEd5kWYgnYYkOYOWZ8I-jc1R0TyAYU3gxQk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 256 + LSPB: 256 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17408 + LdsInitCVgprs: false + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x512_MI16y2XC2HjmywvRNMRzvdc4xwYo7V4GZ1mS8mdECpGkL28= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 32 + LSCB: 16 + LSPA: 128 + LSPB: 256 + LVCA: 2 + LVCB: 1 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 + LdsInitCVgprs: false + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x512_MI162YLFRDLjA2hwymDeM7kI4wEg1_Ytlga74L2JT4VF0MM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 64 + LSCB: 16 + LSPA: 64 + LSPB: 256 + LVCA: 4 + LVCB: 1 + LVPA: 4 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41984 + LdsInitCVgprs: false + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x512_MI1651AQfSOaqL9x7qKXZbPBT7ZWUW0NRJycR149B4OSQTc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 256 + LSPB: 128 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 + LdsInitCVgprs: false + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x512_MI1674YMvMGkXuXL1Cf1gzLE5KEGPC_Zl12HLRkso0LE7_M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 64 + LSPA: 256 + LSPB: 64 + LVCA: 1 + LVCB: 4 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41984 + LdsInitCVgprs: false + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x1024_MI1vOk7fZIE6d2Crn4j8R3f-gr9gYb6d1i7QRitxgT49e8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 256 + LSPB: 256 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x32_MI16xLaYt4XA53nJMoN4GRDxGCoYkMto7pvV9s9Hmk9M424A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6464 + LdsInitCVgprs: false + LdsNumBytes: 6464 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 3 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- null +- null +- null +- null +- DeviceEfficiency +- Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs.yaml new file mode 100644 index 000000000000..4e16df01f0e6 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs.yaml @@ -0,0 +1,14934 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x64x128_MI1zEvSrUfniyIPA5Nficsug_rlEaVAGxHNYFMuvmYifqc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB1024_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 64 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41216 + LdsInitCVgprs: false + LdsNumBytes: 41216 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB1024_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x128_MI1oq7eVBWwzey19A8uIqsDkLEvAC8u4jm_3WdeHmyqNY8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB512_LBSPPM0_LPA0_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37120 + LdsInitCVgprs: false + LdsNumBytes: 37120 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB512_LBSPPM0_LPA0_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x128_MI1RipBYLITmgLj7nJiXhyI57h-QdswAlXmvZRQ-t7Yij4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33280 + LdsInitCVgprs: false + LdsNumBytes: 33280 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 90368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 90368 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x128_MI1iIhyCvIYW4EQ_x86i0IZ86zDKGdESkaJ9YYcm0hSGzI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 32 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29184 + LdsInitCVgprs: false + LdsNumBytes: 29184 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 57600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 57600 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 32 + MacroTileA: 192 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 1 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x128_MI1eZeOxd9MrHLQR2GNA65EfarGb_PDYi0bDvnnTj3iymw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29184 + LdsInitCVgprs: false + LdsNumBytes: 29184 + LdsNumElementsAlignedA: 20736 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 20736 + LdsOffsetB_Blk: 53504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 53504 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 2 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x128_MI1fok8NkYtMokxqZZMiJ_iGKUDQvUnmdxY1flvKGmyHyE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25088 + LdsInitCVgprs: false + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 20736 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 20736 + LdsOffsetB_Blk: 53504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 53504 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 32 + MacroTileA: 160 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 5 + NumLoadsB: 1 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x128_MI4THos_lFJzWi_Dnch9qAlPGNCCGbpGTJT_pt2NL4ZB8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA64_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 64 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA64_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x128_MI1pEJy9yEKZBXA9EcPO5rjF-8vFHW5-X8UkNyH2oVn994= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1536_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29440 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 64 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1536_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x128_MI1S6e2ViSr3lOZh36puDEQ4ogQb2Kj5RPE7WzL1lq3F0M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25344 + LdsInitCVgprs: false + LdsNumBytes: 25344 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25344 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x128_MI1C6jTq6qNnho1ENixE4x2q_23xMvUAX0Q3bKaL_ah5XQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 21248 + LdsInitCVgprs: false + LdsNumBytes: 21248 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21248 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 64 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x128_MI1158FxB1URc5cpKzdBy5PKmwBlaNnYvU5pNK2aJTi3FY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB2048_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29440 + LdsInitCVgprs: false + LdsNumBytes: 29440 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29440 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB2048_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x128_MI163lHbClsVTGPnNSzP8QvKwmggFBp2OvBLf9IoiXlJZAY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25088 + LdsInitCVgprs: false + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x128_MI16cyaq9MxU7sNbLEazHdXIfoficQy4YOPE4TFqMztilgY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 20992 + LdsInitCVgprs: false + LdsNumBytes: 20992 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x128_MI16O3R5PeDu0JjUKDdGyCc1CoE20Q-U4WqcAOA_B9UwqIo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 16896 + LdsInitCVgprs: false + LdsNumBytes: 16896 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 1 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x256x128_MI1LPyOUj4mpprhT6Es7dZqjCKeRoX1IDrnjJ3fYZl1HPI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 256 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41216 + LdsInitCVgprs: false + LdsNumBytes: 41216 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x128_MI1dsiiVyF-YL7QpBTxukjNXBE5_N2gPbhGhfp7ioSoYBo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33280 + LdsInitCVgprs: false + LdsNumBytes: 33280 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x128_MI1w-MN2Mm1J0St8q8s9xFauQAWxA0khy1Pd4WoSoekSlg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 32 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29184 + LdsInitCVgprs: false + LdsNumBytes: 29184 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 20736 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x128_MI1cgKWeOcB-B078ImMnsIo0ZS6zy_3JERNpCWjdkytEyw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25344 + LdsInitCVgprs: false + LdsNumBytes: 25344 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25344 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x128_MI16_rKHyj2ip3IiAlJ-oI4Mi3S3Jk5OCWj2H83MWBg-Vss= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 32 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 20992 + LdsInitCVgprs: false + LdsNumBytes: 20992 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x128_MI16Qa_B98mEcapAL2iX2YgOcoJisssJyHDlM9u9CPAkaUM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 16896 + LdsInitCVgprs: false + LdsNumBytes: 16896 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x128_MI16EwXWjXKR9ATyJNfH3DyoYi3LQ4H0ugLmKOZi9zzkKhc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 32 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29056 + LdsInitCVgprs: false + LdsNumBytes: 29056 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 24832 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 24832 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x128_MI15LrCqHq2DyagcJd2X5qTNiDHWQnVHm_AqMqcFDlvmsU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB4096_LBSPPM0_LPA16_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37120 + LdsInitCVgprs: false + LdsNumBytes: 37120 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 69888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 69888 + LdsPadA: 16 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 8] + MIWaveTileA: 1 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB4096_LBSPPM0_LPA16_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x128_MI1RS5-E3dcQYltJiaIWglbCOB49DJT8EKGQr29y9ht3jk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29184 + LdsInitCVgprs: false + LdsNumBytes: 29184 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29184 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 6] + MIWaveTileA: 1 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 192 + MacroTileA: 32 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 1 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x128_MI1WAM7dQoYboHh0SapFH0XK75zgHRfXzHLhlFGHRFgW8U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25088 + LdsInitCVgprs: false + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 20736 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 160 + MacroTileA: 32 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 1 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x128_MI1fkykvXwBDcTCeZQcsORsfStQSOevaJqa0Sia79tVqpE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 21248 + LdsInitCVgprs: false + LdsNumBytes: 21248 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21248 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x128_MI16WEwxgg8Y4FSUK3HPuX6yKvRQPHQ0p3bui3ZyCjYgs5k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 16896 + LdsInitCVgprs: false + LdsNumBytes: 16896 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 1 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x128_MI16PYalQdoVElqz6wbixR1EPrpAfE0UNhfwtrN4ZWmHKYs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29184 + LdsInitCVgprs: false + LdsNumBytes: 29184 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4352 + LdsOffsetMetadata_Blk: 20736 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x128_MI16XYtEgd7zoKly5GXv2t2pAYLX-e76OQRyOB6Ygozeu0A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24960 + LdsInitCVgprs: false + LdsNumBytes: 24960 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4352 + LdsOffsetMetadata_Blk: 20736 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x128_MI1JWVR-jh-aLg0rrdtghL_Hme5Pul5fnhSrwkCOBuPtIU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB768_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 768 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 39680 + LdsInitCVgprs: false + LdsNumBytes: 39680 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 6400 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 39680 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 48 + MacroTileA: 256 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB768_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x128_MI1PwJ42uBJtJDXBA5J9eUpFXnSBdG873yhqH9x9xHr9aM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35584 + LdsInitCVgprs: false + LdsNumBytes: 35584 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x128_MI1-ckiACWa5_Hl9q0vruUn7fAZYeT9xW9mnDjkhP2c50s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA768_LBSPPB4096_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 39680 + LdsInitCVgprs: false + LdsNumBytes: 39680 + LdsNumElementsAlignedA: 6400 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 39680 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 16 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 256 + MacroTileA: 48 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA768_LBSPPB4096_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x128_MI1MnL3tvF_jGDAY_I8anP7ysG0uEcoQof1urIS_BrpX_k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35584 + LdsInitCVgprs: false + LdsNumBytes: 35584 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 67840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 67840 + LdsPadA: 16 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI1arXBxIFzyAilDtP1DqyksxdPITOkluw9jmLU5u0Tt2w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI1rXabrsjNwkup_JrxozMVR_3U4GdmTFKff0KArypEYAA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 64 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI1VT_L439A1JYsr4mdupck3knPO5y9YC1PTbi05SL_Cz0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16zQIQS-CrV5B-hJqIIcfIrW4-99zhrrQJDstr_sFS9t4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x256_MI16yzgS8-MSN2UCxzxDcSy8jzQVwvB-olaZ82o1J7M6llU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 32 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25344 + LdsInitCVgprs: false + LdsNumBytes: 25344 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25344 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI12hFMSt6lMTI8oWW_7aZTwxjiukY6LB_x-MEbbxd5p3M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 16 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x256_MI163QE_QBcfCjhjm6YvZW1aBPJMqF6E2nryKoAjsugzB0U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25344 + LdsInitCVgprs: false + LdsNumBytes: 25344 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25344 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x256_MI16CqP0rVSFO3WDeB13qT6LUGERdrWprMFCekioRFqiTqQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 16896 + LdsInitCVgprs: false + LdsNumBytes: 16896 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x256_MI1h8DSl_KtVz4vyL0LXehy8GQa575LmnT-GS8Sm349l8c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 16 + LSPA: 16 + LSPB: 256 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 70912 + LdsInitCVgprs: false + LdsNumBytes: 70912 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 70912 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 64 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x256_MI1ABd-H42vx6NZOkifL5ShxVhgON-oxRsJ0GyaVxfUhIg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 256 + LSPA: 256 + LSPB: 16 + LVCA: 1 + LVCB: 16 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 70912 + LdsInitCVgprs: false + LdsNumBytes: 70912 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 66560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 135424 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 70912 + LdsOffsetMetadata_Blk: 135424 + LdsPadA: 16 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA16_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x512_MI160Nuqi97WrEd5kWYgnYYkOYOWZ8I-jc1R0TyAYU3gxQk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 256 + LSPB: 256 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17408 + LdsInitCVgprs: false + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x512_MI16y2XC2HjmywvRNMRzvdc4xwYo7V4GZ1mS8mdECpGkL28= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 32 + LSCB: 16 + LSPA: 128 + LSPB: 256 + LVCA: 2 + LVCB: 1 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 + LdsInitCVgprs: false + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x512_MI162YLFRDLjA2hwymDeM7kI4wEg1_Ytlga74L2JT4VF0MM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 64 + LSCB: 16 + LSPA: 64 + LSPB: 256 + LVCA: 4 + LVCB: 1 + LVPA: 4 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41984 + LdsInitCVgprs: false + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x512_MI1651AQfSOaqL9x7qKXZbPBT7ZWUW0NRJycR149B4OSQTc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 256 + LSPB: 128 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 + LdsInitCVgprs: false + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x512_MI1674YMvMGkXuXL1Cf1gzLE5KEGPC_Zl12HLRkso0LE7_M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 64 + LSPA: 256 + LSPB: 64 + LVCA: 1 + LVCB: 4 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41984 + LdsInitCVgprs: false + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x1024_MI1vOk7fZIE6d2Crn4j8R3f-gr9gYb6d1i7QRitxgT49e8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 256 + LSPB: 256 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x32_MI16xLaYt4XA53nJMoN4GRDxGCoYkMto7pvV9s9Hmk9M424A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6464 + LdsInitCVgprs: false + LdsNumBytes: 6464 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 3 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- null +- null +- null +- null +- DeviceEfficiency +- Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs.yaml new file mode 100644 index 000000000000..91fcc796a06c --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs.yaml @@ -0,0 +1,27357 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x192x128_MIk_RqFM_AwoflFBuOWbvk-8fTqjDZ1FddXgJjbcy-G0k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 125952 + LdsInitCVgprs: false + LdsNumBytes: 125952 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x160x128_MISPq2JndWaX4OL12RUN6ulsn7QcNRohtTIZC1Edgdyws= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x160x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123904 + LdsInitCVgprs: false + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x160x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x128x128_MIKVXPdd7d_qWsOE3R4Nn3WRva2YN_nzMUEkSXMgI4hvI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB512_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB512_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x96x128_MI1T7zX-LAq1W4S0p0x8V1ahqv2mXqZFvICIzJD4wjn5vU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 48128 + LdsInitCVgprs: false + LdsNumBytes: 48128 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 48128 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x64x128_MI1C8Q9_g8YRB2HVCaci6ZpxWtz2F-KOjbPdcOSgM2OeEI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41984 + LdsInitCVgprs: false + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x128_MI1wcQqKyv34c9LbqUML817LBELS8jNDkpatWFuPGnAz0I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37888 + LdsInitCVgprs: false + LdsNumBytes: 37888 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37888 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x160x128_MIghxlKUHytHxV_rsO7Y3k-hKsNgsUv14Ji4M_f4zzLUg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50432 + LdsInitCVgprs: false + LdsNumBytes: 50432 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 90368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50432 + LdsOffsetMetadata_Blk: 90368 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 5] + MIWaveTileA: 6 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 160 + MacroTileA: 192 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 5 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 5 + ThreadTileA: 24 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x128x128_MIJBI5cmbVufum9IvszCzB765s6mkvxNBCOv5r57HrQ5k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 90368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 90368 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x96x128_MI195MLYeeCjfNjgEj-ik95KMPTRnsuVyYIPGD8iI7U3FA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40192 + LdsInitCVgprs: false + LdsNumBytes: 40192 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 90368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40192 + LdsOffsetMetadata_Blk: 90368 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 96 + MacroTileA: 192 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x128_MI1enbINkflafBFO3_Pebg7jg-XrgUNrjPKEtJ1FG_ArW4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34048 + LdsInitCVgprs: false + LdsNumBytes: 34048 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 90368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34048 + LdsOffsetMetadata_Blk: 90368 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x128_MI1AXbZC25QBPC0PuokhYA5b8WyOR4HvuGr9lZsTR0HMWk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 57600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 57600 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 32 + MacroTileA: 192 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 1 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x128x128_MIYn1IItDWJZY2hmD9-0eW5TADd3akO6yu4iNVSScCFLk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38144 + LdsInitCVgprs: false + LdsNumBytes: 38144 + LdsNumElementsAlignedA: 20736 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20736 + LdsOffsetB_Blk: 86272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38144 + LdsOffsetMetadata_Blk: 86272 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 4 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x96x128_MI15qZVVldYFEVxv6DoRCJq1o-8jbrdV19RqRcT-bcTp5g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36096 + LdsInitCVgprs: false + LdsNumBytes: 36096 + LdsNumElementsAlignedA: 20736 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20736 + LdsOffsetB_Blk: 86272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 86272 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 3] + MIWaveTileA: 5 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 96 + MacroTileA: 160 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 5 + NumLoadsB: 3 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 3 + ThreadTileA: 20 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x128_MI1lHcr20MZlAXhwNOR6Gevj4wJaRc4B-tum0bYx49prt8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 20736 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 20736 + LdsOffsetB_Blk: 53504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 53504 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 2 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x128_MI1XvrvtOqqq8hKEVOCgfEFmALGzVphawbwpkAxG7nqoi8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 20736 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 20736 + LdsOffsetB_Blk: 53504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 53504 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 32 + MacroTileA: 160 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 5 + NumLoadsB: 1 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x256x128_MIYXJbKl66tz5Yz16uScZ-3QcnIi8M-PaVMoZpl5IQJR0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x224x128_MIWMlYxh0gBBwfi1wvTDHJHkSEVXIuF12-m4Tq-srJGFk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 52736 + LdsInitCVgprs: false + LdsNumBytes: 52736 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x192x128_MIU-MW4z63wAohwJUDz0C8OcgBozd0DUJEBLSSdararIA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 + LdsInitCVgprs: false + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x160x128_MIZLtCnUAcZQRb2Sm7BFv7A3a2vBOCNqxyJPub1dc8Z8U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42496 + LdsInitCVgprs: false + LdsNumBytes: 42496 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42496 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x128_MInMSU2Gh3mMVemcGZ3WqUWu2l4NG8N5rOCZZl_tZSxr4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x128_MI1VCOEzWL6LPvmD0HkYK2IQmKeQ33MFQ6q3DD1gUWXPWI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 + LdsInitCVgprs: false + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x128_MI1bvrvP40X4hFkSo1KwPg0VEiYDXT6P8LD2PVpHtPr4t8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x128_MI1KCNveOe_jYnymu00u65Iq4hQQmFEfEECKI4hYCSw86A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 22016 + LdsInitCVgprs: false + LdsNumBytes: 22016 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 22016 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x256x128_MI11tDVdUBSEo3UIS5TLyJqHVgQw3dto3agLBzaHthkf5A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46336 + LdsInitCVgprs: false + LdsNumBytes: 46336 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 78080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46336 + LdsOffsetMetadata_Blk: 78080 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 8] + MIWaveTileA: 3 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 8 + ThreadTileA: 12 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x224x128_MI1uDCZdlP32w2aUIZQgU6UFaItvoNYlwEGmbTkXc1IEtc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 48384 + LdsInitCVgprs: false + LdsNumBytes: 48384 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 78080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 48384 + LdsOffsetMetadata_Blk: 78080 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 7] + MIWaveTileA: 3 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 224 + MacroTileA: 96 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 84 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 3 + NumLoadsB: 7 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 7 + ThreadTileA: 12 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x192x128_MI1tIdq_8HHc1AUHlfryN0_49Q8EAD9oEnK2lMgmhIKC2E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40192 + LdsInitCVgprs: false + LdsNumBytes: 40192 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 78080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40192 + LdsOffsetMetadata_Blk: 78080 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 6] + MIWaveTileA: 3 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 3 + NumLoadsB: 6 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 6 + ThreadTileA: 12 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x160x128_MI1o_9BqNra6zh_HmLvWFNQm4WYzN2W1bygUabZEe9Lzsg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38144 + LdsInitCVgprs: false + LdsNumBytes: 38144 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 78080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38144 + LdsOffsetMetadata_Blk: 78080 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 160 + MacroTileA: 96 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 3 + NumLoadsB: 5 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x128_MI1KePfYaceZ29tiGigvOtcCFJNsP7pueMsjzyHKVb7w64= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x128_MI16tOU8-XLYrNPdvvuRaPZblzPTCq0IYpmOqRVYwuw2zIA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27904 + LdsInitCVgprs: false + LdsNumBytes: 27904 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27904 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x128_MI16iQaL89dfDqFaUwHSFuqBVL1d7gxxRqiMEISWe48rBhk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 21760 + LdsInitCVgprs: false + LdsNumBytes: 21760 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21760 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x128_MI16G697WmrCkkuSW0ygicOEsjSabf2uABvGDaKBNngK4oE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17664 + LdsInitCVgprs: false + LdsNumBytes: 17664 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 1 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x256x128_MI1opMCZdZFzR31a1nXMExXx-HFGwt6FHBNysNBuXPo34o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x224x128_MI1GPhaxX2PStJ8QfX3S-AgQFNJD6o4d_c2qqsOdt_oaC4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44288 + LdsInitCVgprs: false + LdsNumBytes: 44288 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44288 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 224 + MacroTileA: 64 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 2 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 7 + ThreadTileA: 8 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x128_MI1gqbADNhoP3bHba8oMSfqNYB-76ixvzWU8E58C_oXxBo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36096 + LdsInitCVgprs: false + LdsNumBytes: 36096 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x128_MI1-VJoMMTo08ubflQERsCoO0j_TzMOoFOBMG1VV9dqzpY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34048 + LdsInitCVgprs: false + LdsNumBytes: 34048 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34048 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x128_MI15OARqHSqH8kbGBPru-B8g6Oad-Zo3FGsUS8SxkzZZYk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x128_MI16xVRyCL8pcYWjhmBX3Sksf7_ECJZKzPUO0YM7WCFd9sM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23808 + LdsInitCVgprs: false + LdsNumBytes: 23808 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x128_MI162emMCIrLY9uAeJtwZl6HlANVDofIksu3xpgoJdG4L7I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17664 + LdsInitCVgprs: false + LdsNumBytes: 17664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x128_MI16UNManzuffvMTmsgfWnWF6Bt5jjvtS0tBsdToDQ_bZvc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 24832 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 24832 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x128_MI1kHn9o1wGnb0Le03rKuL4bApCYLkFlSLtoU6MjkRibzc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38144 + LdsInitCVgprs: false + LdsNumBytes: 38144 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 69888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38144 + LdsOffsetMetadata_Blk: 69888 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 8] + MIWaveTileA: 1 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x224x128_MI1NMWe8o4PG1BIiKP1ggEjZUpwkkJSDyvBBNMZ8tXiaR8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40192 + LdsInitCVgprs: false + LdsNumBytes: 40192 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 69888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40192 + LdsOffsetMetadata_Blk: 69888 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 7] + MIWaveTileA: 1 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 224 + MacroTileA: 32 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 1 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 7 + ThreadTileA: 4 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x128_MI1Q0rm-ZJ-B1Ep3tDRV7Y6v5xFppIw1pc1wB6W7qCB6eE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32000 + LdsInitCVgprs: false + LdsNumBytes: 32000 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32000 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 6] + MIWaveTileA: 1 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 192 + MacroTileA: 32 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 1 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x128_MI14_XY01zUqMSgVV3Z29zc0HvqY2RBU1XaMixpmKPTFvQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 160 + MacroTileA: 32 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 1 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x128_MI1BV068w-wQamiyACMJrCQO6AzvkFtVVaL6YUOIybq_As= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 21760 + LdsInitCVgprs: false + LdsNumBytes: 21760 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21760 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x128_MI167ogjNu2arPxWNgqgKM8sDxj_QbeHe5G3biyDU4kITPE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 19712 + LdsInitCVgprs: false + LdsNumBytes: 19712 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 19712 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 1 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x128_MI1606uPSrg8jSF_adB8-Q1EUFxmHwf5F9lLMz64s3UmyCk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4352 + LdsOffsetMetadata_Blk: 20736 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x128_MI16Jm6coo6HlXoYc23v3SkS10L-uM9lwa5bnHnY2Vs_oLo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4352 + LdsOffsetMetadata_Blk: 20736 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x208x128_MIv9D6qCD557UuGj0WYoB1KLme1Sh1mzoR8zvP-Ew1rDY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x208x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 13] + MIWaveTileA: 4 + MIWaveTileB: 13 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 208 + MacroTileA: 256 + MacroTileB: 208 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 208 + NumGlobalWriteVectorsPerThread: 52 + NumLoadsA: 8 + NumLoadsB: 13 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 13 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x208x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 13 + ThreadTileA: 16 + ThreadTileB: 13 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x176x128_MIZTzgVMYbF8zAjkLbWPx_KguNjRPGDWzQphgfwu106mI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x176x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 28160 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 11] + MIWaveTileA: 4 + MIWaveTileB: 11 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 176 + MacroTileA: 256 + MacroTileB: 176 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 176 + NumGlobalWriteVectorsPerThread: 44 + NumLoadsA: 8 + NumLoadsB: 11 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 11 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x176x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 11 + ThreadTileA: 16 + ThreadTileB: 11 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x144x128_MIYpXgEPF2KmPhp_HSItkaLdu8ZfTql6wuzRMctpM8bWs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x144x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 121856 + LdsInitCVgprs: false + LdsNumBytes: 121856 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 144 + MacroTileA: 256 + MacroTileB: 144 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x144x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 9 + ThreadTileA: 16 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x112x128_MIM6ky9Tq6VRezat1v-Vy4Qy4OEEUGyFJHwE-XvlIpRaY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x112x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 17920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 112 + MacroTileA: 256 + MacroTileB: 112 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x112x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x80x128_MI1ZaT28UTzCJBM6wCUWArJk9cu4afCdA6oGhULOQC7jwQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46080 + LdsInitCVgprs: false + LdsNumBytes: 46080 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 12800 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 80 + MacroTileA: 256 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x128_MI1ckLt2U3Q4BSlwSpNR-cpd6uJ3sMr5OVPRWuKvEiSVtY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 7680 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 48 + MacroTileA: 256 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x128_MI1Cks6yoWu7mRHY5OU_L0wlyt_BkdBZ9B6KYLX0jMjEZM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35840 + LdsInitCVgprs: false + LdsNumBytes: 35840 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT80x256x128_MI1LRieWm8obTv5BrugK6ZfaH9J4-zT3LrRBcx6GxwPdUk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT80x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1280_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 1280 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45312 + LdsInitCVgprs: false + LdsNumBytes: 45312 + LdsNumElementsAlignedA: 10496 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10496 + LdsOffsetB_Blk: 76032 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45312 + LdsOffsetMetadata_Blk: 76032 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT80x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1280_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x128_MI1zBVp9O_j-pDQmFJ0pruVZjPjyXjbaxIDNKuOKzE-hUk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA768_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41216 + LdsInitCVgprs: false + LdsNumBytes: 41216 + LdsNumElementsAlignedA: 6400 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 256 + MacroTileA: 48 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA768_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x128_MI1LmEbjkFjRe2EHvxZ5njVJSBpnw78dqd37eC05zoq8oY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37120 + LdsInitCVgprs: false + LdsNumBytes: 37120 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 67840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 67840 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x256_MI_Cpsqq7IdCcI6m09ixnRrSz_HTRJbwY41LkpuZ6eaow= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x256_MI1tP0yLw5sOaqSqLw69T0rUISjIec4gtRcEBnLindXxd0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x256_MI1WlKSkQYWc45q9-9u3Sa8KR5G5YSwdMXXvYIooja5mzU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x256_MI1ZFz2Wqi9AGPfO9YZeT8hu0U-a6QRpBHZ1MNZhOuWfzs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x256_MI16YTIDOkvWpHTvJMbXMK3Qn2TUcHpLDIz3L67zYqwLtxM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34048 + LdsInitCVgprs: false + LdsNumBytes: 34048 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 90368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34048 + LdsOffsetMetadata_Blk: 90368 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x256_MI1Hyiuad9E_LwogHYOQgHONBkR83eXTUE_-HwHw1z6c2M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 256 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62976 + LdsInitCVgprs: false + LdsNumBytes: 62976 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 62976 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x256_MI1_vkhGjNt0PNt_xV8VVOz2XzUCfwEcU8zljVlbKTICVg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 256 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x256_MI16riuLixhIaRRQRAFH27-FM-5iTnYrSUKGh2HFdVNH4W8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 256 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 + LdsInitCVgprs: false + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x256_MI16tRLsgniRf9NQ8cwUknJpPeoosk2I0euxrG_yXuRD6Zc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 256 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x256_MI16zCjLhPMzebcvjlA5OUrujTBGN1OYulB3ktj1a570vLs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 256 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x256_MI1OuevmgTEfRQM80_r7hzorlLPNIZiSHCEk0LhjgcOAH8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60672 + LdsInitCVgprs: false + LdsNumBytes: 60672 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 60672 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 6] + MIWaveTileA: 1 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 192 + MacroTileA: 32 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x256_MI117HfbCeMXp_75IhCNgyVsAtudVa47Y080duX4tlb_Cc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 54528 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 160 + MacroTileA: 32 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x256_MI1w_TVfNrjE3mYvq96Bm1Jgkl-HCbXvzNQ6H6gr4PSnKc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x256_MI16IX89BCLJtYzlNb-bQG51jUiesgW7gn9abBGKzaC4u7Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36096 + LdsInitCVgprs: false + LdsNumBytes: 36096 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x256_MI16yEC0PHvmD0zrKvRDTkhqsmiXq78VvD0Eg5fcAnocoic= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x256_MI16_OtB8fTrZk3Pt0bE3bXfsWfkoWXEPa2ENPfe4eaiVRI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17664 + LdsInitCVgprs: false + LdsNumBytes: 17664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x256_MI1uaMQi1863lUOUemwTE99mttXxpTRkvqvmiwvtfx6Qx0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 80384 + LdsInitCVgprs: false + LdsNumBytes: 80384 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 80384 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 48 + MacroTileA: 256 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x256_MI1_aftRJpk6u1tTs3-Hwls-Fo3A3dD889Os-hWuvfNQvw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 71168 + LdsInitCVgprs: false + LdsNumBytes: 71168 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 71168 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x256_MI18CjcSziK0PjBic2MAopWR-Ta_nHbDse6qNtds5pye7Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 256 + LSPA: 256 + LSPB: 16 + LVCA: 1 + LVCB: 16 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 71936 + LdsInitCVgprs: false + LdsNumBytes: 71936 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 135424 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 71936 + LdsOffsetMetadata_Blk: 135424 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x512_MI16-AdnHWdGaF-mVZCwCYUd2wq3b2fSNewc3-7rmRp2b3Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 512 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x512_MI16pBqfsRIAkQBDjOQRcdakPvu1rZh51vJuBh1AVswNR5Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 512 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI169OaDNKWZnXMa1R_AlPEgQpVqAFVb54TBTn3PvUMxY7A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 512 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16aFeBRs_oObL14ukkJN1DauqQpjn3rpabfuRB4QWnh0o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 512 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x512_MI16JRL6UviIw3ql2MWJlkjDhAPVUQuM-dfo_YP9V8INeCE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 512 + LSPA: 256 + LSPB: 8 + LVCA: 1 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17408 + LdsInitCVgprs: false + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x512_MI16a9rQBxUkyvDWlmJquyIGWqzMINaNpRD_aWL9QWIuzQ8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 32 + LSCB: 512 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 + LdsInitCVgprs: false + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x512_MI16Jyq1ax9kWbpfAk08jNUVf7z1XoXRVCVdpt-kBpa0fWY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 64 + LSCB: 512 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41984 + LdsInitCVgprs: false + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x512_MI16D0scrs9_FOYkVHUHqfaWIBeu8Rz72d3pOV3R8iV8H74= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 512 + LSPA: 256 + LSPB: 8 + LVCA: 1 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x512_MI169Ao_eCjaajChoPx9i0y3MGqMyeJdbqU3dbjwB_UIKx4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 512 + LSPA: 256 + LSPB: 8 + LVCA: 1 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x1024_MI1DDaFTcewwV5Ybn8swfgQtfH13aPb98dLOnRrTIJYmSQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 1024 + LSPA: 256 + LSPB: 4 + LVCA: 1 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x1024_MI1ifWE51xli667eSWHdf8X0Gqm1isofKMCZj08QiVl3U0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 32 + LSCB: 1024 + LSPA: 128 + LSPB: 4 + LVCA: 2 + LVCB: 64 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x1024_MI1U06udU6pkFuvB0exyiBHs-wOgMdXOgbcv4nENk4uw44= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 1024 + LSPA: 256 + LSPB: 4 + LVCA: 1 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x1024_MI1oDw0RAqdNEFhP7idqV0O87totk9ugWGoiXKFb7JRQVw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 1024 + LSPA: 256 + LSPB: 4 + LVCA: 1 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84992 + LdsInitCVgprs: false + LdsNumBytes: 84992 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84992 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x32_MI16xqwn4mYe9PqjtRIapRznDsflXKMnaPO3v98RcshyCRoc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6464 + LdsInitCVgprs: false + LdsNumBytes: 6464 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 3 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- null +- null +- null +- null +- DeviceEfficiency +- Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs.yaml new file mode 100644 index 000000000000..469bba7e28dc --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs.yaml @@ -0,0 +1,27357 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x192x128_MIk_RqFM_AwoflFBuOWbvk-8fTqjDZ1FddXgJjbcy-G0k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 125952 + LdsInitCVgprs: false + LdsNumBytes: 125952 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x160x128_MISPq2JndWaX4OL12RUN6ulsn7QcNRohtTIZC1Edgdyws= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x160x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123904 + LdsInitCVgprs: false + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x160x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x128x128_MIKVXPdd7d_qWsOE3R4Nn3WRva2YN_nzMUEkSXMgI4hvI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB512_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB512_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x96x128_MI1T7zX-LAq1W4S0p0x8V1ahqv2mXqZFvICIzJD4wjn5vU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 48128 + LdsInitCVgprs: false + LdsNumBytes: 48128 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 48128 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x64x128_MI1C8Q9_g8YRB2HVCaci6ZpxWtz2F-KOjbPdcOSgM2OeEI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41984 + LdsInitCVgprs: false + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x128_MI1wcQqKyv34c9LbqUML817LBELS8jNDkpatWFuPGnAz0I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37888 + LdsInitCVgprs: false + LdsNumBytes: 37888 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37888 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x160x128_MIghxlKUHytHxV_rsO7Y3k-hKsNgsUv14Ji4M_f4zzLUg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50432 + LdsInitCVgprs: false + LdsNumBytes: 50432 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 90368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50432 + LdsOffsetMetadata_Blk: 90368 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 5] + MIWaveTileA: 6 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 160 + MacroTileA: 192 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 5 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 5 + ThreadTileA: 24 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x128x128_MIJBI5cmbVufum9IvszCzB765s6mkvxNBCOv5r57HrQ5k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 90368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 90368 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x96x128_MI195MLYeeCjfNjgEj-ik95KMPTRnsuVyYIPGD8iI7U3FA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40192 + LdsInitCVgprs: false + LdsNumBytes: 40192 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 90368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40192 + LdsOffsetMetadata_Blk: 90368 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 96 + MacroTileA: 192 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x128_MI1enbINkflafBFO3_Pebg7jg-XrgUNrjPKEtJ1FG_ArW4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34048 + LdsInitCVgprs: false + LdsNumBytes: 34048 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 90368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34048 + LdsOffsetMetadata_Blk: 90368 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x128_MI1AXbZC25QBPC0PuokhYA5b8WyOR4HvuGr9lZsTR0HMWk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 57600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 57600 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 32 + MacroTileA: 192 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 1 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x128x128_MIYn1IItDWJZY2hmD9-0eW5TADd3akO6yu4iNVSScCFLk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38144 + LdsInitCVgprs: false + LdsNumBytes: 38144 + LdsNumElementsAlignedA: 20736 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20736 + LdsOffsetB_Blk: 86272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38144 + LdsOffsetMetadata_Blk: 86272 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 4 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x96x128_MI15qZVVldYFEVxv6DoRCJq1o-8jbrdV19RqRcT-bcTp5g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36096 + LdsInitCVgprs: false + LdsNumBytes: 36096 + LdsNumElementsAlignedA: 20736 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20736 + LdsOffsetB_Blk: 86272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 86272 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 3] + MIWaveTileA: 5 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 96 + MacroTileA: 160 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 5 + NumLoadsB: 3 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 3 + ThreadTileA: 20 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x128_MI1lHcr20MZlAXhwNOR6Gevj4wJaRc4B-tum0bYx49prt8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 20736 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 20736 + LdsOffsetB_Blk: 53504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 53504 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 2 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x128_MI1XvrvtOqqq8hKEVOCgfEFmALGzVphawbwpkAxG7nqoi8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 20736 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 20736 + LdsOffsetB_Blk: 53504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 53504 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 32 + MacroTileA: 160 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 5 + NumLoadsB: 1 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x256x128_MIYXJbKl66tz5Yz16uScZ-3QcnIi8M-PaVMoZpl5IQJR0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x224x128_MIWMlYxh0gBBwfi1wvTDHJHkSEVXIuF12-m4Tq-srJGFk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 52736 + LdsInitCVgprs: false + LdsNumBytes: 52736 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x192x128_MIU-MW4z63wAohwJUDz0C8OcgBozd0DUJEBLSSdararIA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 + LdsInitCVgprs: false + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x160x128_MIZLtCnUAcZQRb2Sm7BFv7A3a2vBOCNqxyJPub1dc8Z8U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42496 + LdsInitCVgprs: false + LdsNumBytes: 42496 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42496 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x128_MInMSU2Gh3mMVemcGZ3WqUWu2l4NG8N5rOCZZl_tZSxr4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x128_MI1VCOEzWL6LPvmD0HkYK2IQmKeQ33MFQ6q3DD1gUWXPWI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 + LdsInitCVgprs: false + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x128_MI1bvrvP40X4hFkSo1KwPg0VEiYDXT6P8LD2PVpHtPr4t8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x128_MI1KCNveOe_jYnymu00u65Iq4hQQmFEfEECKI4hYCSw86A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 22016 + LdsInitCVgprs: false + LdsNumBytes: 22016 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 22016 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x256x128_MI11tDVdUBSEo3UIS5TLyJqHVgQw3dto3agLBzaHthkf5A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46336 + LdsInitCVgprs: false + LdsNumBytes: 46336 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 78080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46336 + LdsOffsetMetadata_Blk: 78080 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 8] + MIWaveTileA: 3 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 8 + ThreadTileA: 12 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x224x128_MI1uDCZdlP32w2aUIZQgU6UFaItvoNYlwEGmbTkXc1IEtc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 48384 + LdsInitCVgprs: false + LdsNumBytes: 48384 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 78080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 48384 + LdsOffsetMetadata_Blk: 78080 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 7] + MIWaveTileA: 3 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 224 + MacroTileA: 96 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 84 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 3 + NumLoadsB: 7 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 7 + ThreadTileA: 12 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x192x128_MI1tIdq_8HHc1AUHlfryN0_49Q8EAD9oEnK2lMgmhIKC2E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40192 + LdsInitCVgprs: false + LdsNumBytes: 40192 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 78080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40192 + LdsOffsetMetadata_Blk: 78080 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 6] + MIWaveTileA: 3 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 3 + NumLoadsB: 6 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 6 + ThreadTileA: 12 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x160x128_MI1o_9BqNra6zh_HmLvWFNQm4WYzN2W1bygUabZEe9Lzsg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38144 + LdsInitCVgprs: false + LdsNumBytes: 38144 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 78080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38144 + LdsOffsetMetadata_Blk: 78080 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 160 + MacroTileA: 96 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 3 + NumLoadsB: 5 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x128_MI1KePfYaceZ29tiGigvOtcCFJNsP7pueMsjzyHKVb7w64= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x128_MI16tOU8-XLYrNPdvvuRaPZblzPTCq0IYpmOqRVYwuw2zIA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27904 + LdsInitCVgprs: false + LdsNumBytes: 27904 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27904 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x128_MI16iQaL89dfDqFaUwHSFuqBVL1d7gxxRqiMEISWe48rBhk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 21760 + LdsInitCVgprs: false + LdsNumBytes: 21760 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21760 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x128_MI16G697WmrCkkuSW0ygicOEsjSabf2uABvGDaKBNngK4oE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17664 + LdsInitCVgprs: false + LdsNumBytes: 17664 + LdsNumElementsAlignedA: 12544 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12544 + LdsOffsetB_Blk: 45312 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 45312 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 1 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x256x128_MI1opMCZdZFzR31a1nXMExXx-HFGwt6FHBNysNBuXPo34o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x224x128_MI1GPhaxX2PStJ8QfX3S-AgQFNJD6o4d_c2qqsOdt_oaC4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44288 + LdsInitCVgprs: false + LdsNumBytes: 44288 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44288 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 224 + MacroTileA: 64 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 2 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 7 + ThreadTileA: 8 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x128_MI1gqbADNhoP3bHba8oMSfqNYB-76ixvzWU8E58C_oXxBo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36096 + LdsInitCVgprs: false + LdsNumBytes: 36096 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x128_MI1-VJoMMTo08ubflQERsCoO0j_TzMOoFOBMG1VV9dqzpY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34048 + LdsInitCVgprs: false + LdsNumBytes: 34048 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34048 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x128_MI15OARqHSqH8kbGBPru-B8g6Oad-Zo3FGsUS8SxkzZZYk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x128_MI16xVRyCL8pcYWjhmBX3Sksf7_ECJZKzPUO0YM7WCFd9sM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23808 + LdsInitCVgprs: false + LdsNumBytes: 23808 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x128_MI162emMCIrLY9uAeJtwZl6HlANVDofIksu3xpgoJdG4L7I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17664 + LdsInitCVgprs: false + LdsNumBytes: 17664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x128_MI16UNManzuffvMTmsgfWnWF6Bt5jjvtS0tBsdToDQ_bZvc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 24832 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 24832 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x128_MI1kHn9o1wGnb0Le03rKuL4bApCYLkFlSLtoU6MjkRibzc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38144 + LdsInitCVgprs: false + LdsNumBytes: 38144 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 69888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38144 + LdsOffsetMetadata_Blk: 69888 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 8] + MIWaveTileA: 1 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x224x128_MI1NMWe8o4PG1BIiKP1ggEjZUpwkkJSDyvBBNMZ8tXiaR8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40192 + LdsInitCVgprs: false + LdsNumBytes: 40192 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 69888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40192 + LdsOffsetMetadata_Blk: 69888 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 7] + MIWaveTileA: 1 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 224 + MacroTileA: 32 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 1 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 7 + ThreadTileA: 4 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x128_MI1Q0rm-ZJ-B1Ep3tDRV7Y6v5xFppIw1pc1wB6W7qCB6eE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32000 + LdsInitCVgprs: false + LdsNumBytes: 32000 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32000 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 6] + MIWaveTileA: 1 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 192 + MacroTileA: 32 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 1 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x128_MI14_XY01zUqMSgVV3Z29zc0HvqY2RBU1XaMixpmKPTFvQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 160 + MacroTileA: 32 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 1 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x128_MI1BV068w-wQamiyACMJrCQO6AzvkFtVVaL6YUOIybq_As= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 21760 + LdsInitCVgprs: false + LdsNumBytes: 21760 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21760 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x128_MI167ogjNu2arPxWNgqgKM8sDxj_QbeHe5G3biyDU4kITPE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 19712 + LdsInitCVgprs: false + LdsNumBytes: 19712 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 37120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 19712 + LdsOffsetMetadata_Blk: 37120 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 1 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x128_MI1606uPSrg8jSF_adB8-Q1EUFxmHwf5F9lLMz64s3UmyCk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4352 + LdsOffsetMetadata_Blk: 20736 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x128_MI16Jm6coo6HlXoYc23v3SkS10L-uM9lwa5bnHnY2Vs_oLo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4352 + LdsOffsetMetadata_Blk: 20736 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x208x128_MIv9D6qCD557UuGj0WYoB1KLme1Sh1mzoR8zvP-Ew1rDY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x208x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 13] + MIWaveTileA: 4 + MIWaveTileB: 13 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 208 + MacroTileA: 256 + MacroTileB: 208 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 208 + NumGlobalWriteVectorsPerThread: 52 + NumLoadsA: 8 + NumLoadsB: 13 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 13 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x208x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 13 + ThreadTileA: 16 + ThreadTileB: 13 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x176x128_MIZTzgVMYbF8zAjkLbWPx_KguNjRPGDWzQphgfwu106mI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x176x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 28160 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 11] + MIWaveTileA: 4 + MIWaveTileB: 11 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 176 + MacroTileA: 256 + MacroTileB: 176 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 176 + NumGlobalWriteVectorsPerThread: 44 + NumLoadsA: 8 + NumLoadsB: 11 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 11 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x176x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 11 + ThreadTileA: 16 + ThreadTileB: 11 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x144x128_MIYpXgEPF2KmPhp_HSItkaLdu8ZfTql6wuzRMctpM8bWs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x144x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 121856 + LdsInitCVgprs: false + LdsNumBytes: 121856 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 144 + MacroTileA: 256 + MacroTileB: 144 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x144x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 9 + ThreadTileA: 16 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x112x128_MIM6ky9Tq6VRezat1v-Vy4Qy4OEEUGyFJHwE-XvlIpRaY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x112x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 17920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 112 + MacroTileA: 256 + MacroTileB: 112 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x112x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x80x128_MI1ZaT28UTzCJBM6wCUWArJk9cu4afCdA6oGhULOQC7jwQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46080 + LdsInitCVgprs: false + LdsNumBytes: 46080 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 12800 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 80 + MacroTileA: 256 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x128_MI1ckLt2U3Q4BSlwSpNR-cpd6uJ3sMr5OVPRWuKvEiSVtY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 7680 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 48 + MacroTileA: 256 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x128_MI1Cks6yoWu7mRHY5OU_L0wlyt_BkdBZ9B6KYLX0jMjEZM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35840 + LdsInitCVgprs: false + LdsNumBytes: 35840 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT80x256x128_MI1LRieWm8obTv5BrugK6ZfaH9J4-zT3LrRBcx6GxwPdUk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT80x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1280_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 1280 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45312 + LdsInitCVgprs: false + LdsNumBytes: 45312 + LdsNumElementsAlignedA: 10496 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 10496 + LdsOffsetB_Blk: 76032 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45312 + LdsOffsetMetadata_Blk: 76032 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT80x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1280_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x128_MI1zBVp9O_j-pDQmFJ0pruVZjPjyXjbaxIDNKuOKzE-hUk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA768_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41216 + LdsInitCVgprs: false + LdsNumBytes: 41216 + LdsNumElementsAlignedA: 6400 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 256 + MacroTileA: 48 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA768_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x128_MI1LmEbjkFjRe2EHvxZ5njVJSBpnw78dqd37eC05zoq8oY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37120 + LdsInitCVgprs: false + LdsNumBytes: 37120 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 67840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 67840 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI_Cpsqq7IdCcI6m09ixnRrSz_HTRJbwY41LkpuZ6eaow= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI1tP0yLw5sOaqSqLw69T0rUISjIec4gtRcEBnLindXxd0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI1WlKSkQYWc45q9-9u3Sa8KR5G5YSwdMXXvYIooja5mzU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI1ZFz2Wqi9AGPfO9YZeT8hu0U-a6QRpBHZ1MNZhOuWfzs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x256_MI16YTIDOkvWpHTvJMbXMK3Qn2TUcHpLDIz3L67zYqwLtxM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34048 + LdsInitCVgprs: false + LdsNumBytes: 34048 + LdsNumElementsAlignedA: 24832 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24832 + LdsOffsetB_Blk: 90368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34048 + LdsOffsetMetadata_Blk: 90368 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x256_MI1Hyiuad9E_LwogHYOQgHONBkR83eXTUE_-HwHw1z6c2M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 256 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62976 + LdsInitCVgprs: false + LdsNumBytes: 62976 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 62976 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI1_vkhGjNt0PNt_xV8VVOz2XzUCfwEcU8zljVlbKTICVg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 256 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x256_MI16riuLixhIaRRQRAFH27-FM-5iTnYrSUKGh2HFdVNH4W8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 256 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 + LdsInitCVgprs: false + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16tRLsgniRf9NQ8cwUknJpPeoosk2I0euxrG_yXuRD6Zc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 256 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x256_MI16zCjLhPMzebcvjlA5OUrujTBGN1OYulB3ktj1a570vLs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 256 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x256_MI1OuevmgTEfRQM80_r7hzorlLPNIZiSHCEk0LhjgcOAH8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60672 + LdsInitCVgprs: false + LdsNumBytes: 60672 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 60672 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 6] + MIWaveTileA: 1 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 192 + MacroTileA: 32 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x256_MI117HfbCeMXp_75IhCNgyVsAtudVa47Y080duX4tlb_Cc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 54528 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 160 + MacroTileA: 32 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI1w_TVfNrjE3mYvq96Bm1Jgkl-HCbXvzNQ6H6gr4PSnKc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x256_MI16IX89BCLJtYzlNb-bQG51jUiesgW7gn9abBGKzaC4u7Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36096 + LdsInitCVgprs: false + LdsNumBytes: 36096 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 73984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 73984 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x256_MI16yEC0PHvmD0zrKvRDTkhqsmiXq78VvD0Eg5fcAnocoic= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x256_MI16_OtB8fTrZk3Pt0bE3bXfsWfkoWXEPa2ENPfe4eaiVRI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17664 + LdsInitCVgprs: false + LdsNumBytes: 17664 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x256_MI1uaMQi1863lUOUemwTE99mttXxpTRkvqvmiwvtfx6Qx0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 80384 + LdsInitCVgprs: false + LdsNumBytes: 80384 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 80384 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 48 + MacroTileA: 256 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x256_MI1_aftRJpk6u1tTs3-Hwls-Fo3A3dD889Os-hWuvfNQvw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 71168 + LdsInitCVgprs: false + LdsNumBytes: 71168 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 71168 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 64 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA64_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x256_MI18CjcSziK0PjBic2MAopWR-Ta_nHbDse6qNtds5pye7Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 16 + LSCB: 256 + LSPA: 256 + LSPB: 16 + LVCA: 1 + LVCB: 16 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 71936 + LdsInitCVgprs: false + LdsNumBytes: 71936 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 135424 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 71936 + LdsOffsetMetadata_Blk: 135424 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x512_MI16-AdnHWdGaF-mVZCwCYUd2wq3b2fSNewc3-7rmRp2b3Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 512 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x512_MI16pBqfsRIAkQBDjOQRcdakPvu1rZh51vJuBh1AVswNR5Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 512 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI169OaDNKWZnXMa1R_AlPEgQpVqAFVb54TBTn3PvUMxY7A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 512 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16aFeBRs_oObL14ukkJN1DauqQpjn3rpabfuRB4QWnh0o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 512 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x512_MI16JRL6UviIw3ql2MWJlkjDhAPVUQuM-dfo_YP9V8INeCE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 512 + LSPA: 256 + LSPB: 8 + LVCA: 1 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17408 + LdsInitCVgprs: false + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x512_MI16a9rQBxUkyvDWlmJquyIGWqzMINaNpRD_aWL9QWIuzQ8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 32 + LSCB: 512 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 + LdsInitCVgprs: false + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x512_MI16Jyq1ax9kWbpfAk08jNUVf7z1XoXRVCVdpt-kBpa0fWY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 64 + LSCB: 512 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41984 + LdsInitCVgprs: false + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x512_MI16D0scrs9_FOYkVHUHqfaWIBeu8Rz72d3pOV3R8iV8H74= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 512 + LSPA: 256 + LSPB: 8 + LVCA: 1 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x512_MI169Ao_eCjaajChoPx9i0y3MGqMyeJdbqU3dbjwB_UIKx4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 512 + LSPA: 256 + LSPB: 8 + LVCA: 1 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x1024_MI1DDaFTcewwV5Ybn8swfgQtfH13aPb98dLOnRrTIJYmSQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 1024 + LSPA: 256 + LSPB: 4 + LVCA: 1 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x1024_MI1ifWE51xli667eSWHdf8X0Gqm1isofKMCZj08QiVl3U0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 32 + LSCB: 1024 + LSPA: 128 + LSPB: 4 + LVCA: 2 + LVCB: 64 + LVPA: 8 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x1024_MI1U06udU6pkFuvB0exyiBHs-wOgMdXOgbcv4nENk4uw44= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 1024 + LSPA: 256 + LSPB: 4 + LVCA: 1 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x1024_MI1oDw0RAqdNEFhP7idqV0O87totk9ugWGoiXKFb7JRQVw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 1024 + LSPA: 256 + LSPB: 4 + LVCA: 1 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84992 + LdsInitCVgprs: false + LdsNumBytes: 84992 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84992 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 16 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x32_MI16xqwn4mYe9PqjtRIapRznDsflXKMnaPO3v98RcshyCRoc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6464 + LdsInitCVgprs: false + LdsNumBytes: 6464 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 3 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- null +- null +- null +- null +- DeviceEfficiency +- Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs.yaml new file mode 100644 index 000000000000..2ee25b19761f --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs.yaml @@ -0,0 +1,27357 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x128x128_MI93W5TiEccmFg8wGd3V3_i7anSx2in1g2Nvgr1nxFMW0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x96x128_MI1WBDB2zkUie8-CEEoI_ArcQdbGzpndP9LCMEirfw4oOM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46336 + LdsInitCVgprs: false + LdsNumBytes: 46336 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46336 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x64x128_MI1byiwSEFKpQOHfD3AL3aMNi54Gti0tPrWia0W_RI3F9I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x128_MI1drE7ijfboWVMJx6XbcD7R5giTiA773IMYwA0SWdbSew= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38144 + LdsInitCVgprs: false + LdsNumBytes: 38144 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38144 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x128x128_MIlwPgpsfEr1tHJhWc_dQ16xuUglNHBRvb9FY8kgrV2uo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 52736 + LdsInitCVgprs: false + LdsNumBytes: 52736 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 4] + MIWaveTileA: 7 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 128 + MacroTileA: 224 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 7 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 4 + ThreadTileA: 28 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x96x128_MI1vpzVkbJ5WB1luwHLUPLp6GT1RJDqsKYFBruB0CuPp3k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 48384 + LdsInitCVgprs: false + LdsNumBytes: 48384 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 48384 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 3] + MIWaveTileA: 7 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 96 + MacroTileA: 224 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 84 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 7 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 3 + ThreadTileA: 28 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x64x128_MI1zuV98IyOwxElPoy5ichUD4fa6nssRrdqd1Q3modyHdk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44288 + LdsInitCVgprs: false + LdsNumBytes: 44288 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44288 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 7 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x32x128_MI1xqlNtEYE-E0Mz0QATa9_dzumE9-a3Wvr1cR9tzlzYW8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40192 + LdsInitCVgprs: false + LdsNumBytes: 40192 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40192 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 32 + MacroTileA: 224 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 7 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x256x128_MICgefWmv3HWofIYKXJsq8855MHgkupzgrkVMcs4u-by0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 125952 + LdsInitCVgprs: false + LdsNumBytes: 125952 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x128x128_MInHiNZ23NKfFsK7oWehXsdVhdC3keTzJy0tZ_ibW9pzc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 + LdsInitCVgprs: false + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x96x128_MI1rAFgL4PTZQ2gx_CQR3J0YNYjnR6ZpZqLT3kKoVcsb8Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40192 + LdsInitCVgprs: false + LdsNumBytes: 40192 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40192 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 96 + MacroTileA: 192 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x128_MI1Vg4WjQQgNmrK6Ytw7ZwOj3TBnXGNte5TZoyB4kGGvxQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36096 + LdsInitCVgprs: false + LdsNumBytes: 36096 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x128_MI1WRxklg9QBBuMW_MrlGv_hyeD_1KjPxvE5CvBb2xGGdk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32000 + LdsInitCVgprs: false + LdsNumBytes: 32000 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 60416 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32000 + LdsOffsetMetadata_Blk: 60416 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 32 + MacroTileA: 192 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x256x128_MIGmZq9BXDiHJ0tc5zxXOziePTEBPwz6cUZrAUknIRT7E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123904 + LdsInitCVgprs: false + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 8] + MIWaveTileA: 5 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 8 + ThreadTileA: 20 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x192x128_MIwjK4OAX-B9dLgo7aLraklJg0ipZLFjGXad70tFlHa_4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50432 + LdsInitCVgprs: false + LdsNumBytes: 50432 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50432 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 6] + MIWaveTileA: 5 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 192 + MacroTileA: 160 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 5 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 6 + ThreadTileA: 20 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x128x128_MIXGlL2yJ1nYGKMcvOKU86eOyg9fj72cyU6dwaNXql1aE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42496 + LdsInitCVgprs: false + LdsNumBytes: 42496 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42496 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x96x128_MI1d_LfxY9vu-XLyyQyjblMOjGGccv6SWDTcAMP2nTGrZ4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38144 + LdsInitCVgprs: false + LdsNumBytes: 38144 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38144 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 3] + MIWaveTileA: 5 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 96 + MacroTileA: 160 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 5 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 3 + ThreadTileA: 20 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x128_MI1xJMqskwq5v3yaRA45HS_l-Q0PeuwEvXnjkyc8iEsIuQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34048 + LdsInitCVgprs: false + LdsNumBytes: 34048 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34048 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x128_MI1J2YusILFQCwZJTPycybXWTPnKyMth_wvsZJGJUL6XBA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 58368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 58368 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 32 + MacroTileA: 160 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 5 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x256x128_MIK3HX_ARTjdrJyQgn8ka1A08cR0NqiFPRF8k7JYx3lno= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x192x128_MIAYYyDwfBJqmmCRgY0wORDPvjgSZIA73oaclExRH_3Ks= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x160x128_MIL86z76qmkuTn8D76ItocU1Fqkm1UTu79IZpG8k_14Mo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38144 + LdsInitCVgprs: false + LdsNumBytes: 38144 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 20736 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38144 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x128_MIcEWO-iBcRFoqXFm5FnHIJnGhh114ULGYJcbTf6WzdFI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x128_MI1uj-yswGYT2k5VXIiR3VHictFIePTXmx0STeF7oDnJ6c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x128_MI1_uypCl737WTznRBFG8qQSTk3KV2VA62EV3JnGAPhybs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x128_MI1SN9OiW0wmLBhcMdgMSszjYfbUYfIwyxoyCZVswMyYtQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 21760 + LdsInitCVgprs: false + LdsNumBytes: 21760 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21760 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x256x128_MI1OdZvyKg_z2Mm3twNkO33DsDFI4XHPhixfFGT8M7-fns= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 48128 + LdsInitCVgprs: false + LdsNumBytes: 48128 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 48128 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 8] + MIWaveTileA: 3 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 8 + ThreadTileA: 12 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x192x128_MI1-rYnfwe4wvrEHrwuVO2V79PzVjbfHc2PuGqRKVJDgfI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40192 + LdsInitCVgprs: false + LdsNumBytes: 40192 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40192 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 6] + MIWaveTileA: 3 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 3 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 6 + ThreadTileA: 12 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x160x128_MI1QbCX-NbN9_FulNkEv2WcOQVCSkdBBRKzE6PDyeD0PIc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36096 + LdsInitCVgprs: false + LdsNumBytes: 36096 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 20736 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 160 + MacroTileA: 96 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 3 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x128_MI15g6oi8kYRfTdSkq4hb7l4vHYA3S6a_b3gQ32PTizD-c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 + LdsInitCVgprs: false + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x128_MI16zaeoUJs0j0gHGLtiiAi_mKOg4oYgz97kp3_f949MqD0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27904 + LdsInitCVgprs: false + LdsNumBytes: 27904 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27904 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x128_MI16n1I05diAnU09ahBMg4ZOJ69DUU98umJt7phil649P_I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23808 + LdsInitCVgprs: false + LdsNumBytes: 23808 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x128_MI16qYVsRYavL1_uEB81mfehkE0xypBMpwpmv8NTKwWUCZQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 19712 + LdsInitCVgprs: false + LdsNumBytes: 19712 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 19712 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x256x128_MI1rLkRPc5Fv6KcZ0pkpjti_dDNkwatHGJAjTt5YgSUF0A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41984 + LdsInitCVgprs: false + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x128_MI1MsUtatzu8gT7lwtRqOR1GlkYA3LAi1h7-WpiR8njS-s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34048 + LdsInitCVgprs: false + LdsNumBytes: 34048 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34048 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x128_MI17ixososrZhbGip5glmYhzY0dhoWQ6s92NzkMuYpNd6c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 20736 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x128_MI1O6oTiyV66Nmoj0i8-AQlJj5R-NLtNsfH6diaQrWbGzQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x128_MI16BDy2huNg1pYAEVpwRmCSx6TZ3y-fyTBhxwPrXBTUMn0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 21760 + LdsInitCVgprs: false + LdsNumBytes: 21760 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21760 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x128_MI16T76NxLk5ebc1y9bf0bGaRoLE1Btglvx1amB35nt2jrg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17664 + LdsInitCVgprs: false + LdsNumBytes: 17664 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x128_MI164mp4kBtFpiXW7TfMmUnYxQd7fjnanNrZue8IY8R1Nt4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29824 + LdsInitCVgprs: false + LdsNumBytes: 29824 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x128_MI1CBdovKQljtt9JJeN35QUWWaGXoZh_9PeLDfb3HuWMiA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37888 + LdsInitCVgprs: false + LdsNumBytes: 37888 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 70656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37888 + LdsOffsetMetadata_Blk: 70656 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 8] + MIWaveTileA: 1 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x128_MI1eYnh8ExIG9i1WyAXPWhGKC2q4srWo55-2vcORDH9kEs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 6] + MIWaveTileA: 1 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 192 + MacroTileA: 32 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 1 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x128_MI1m1tiKe28X_8BSOyU__5gEgsr8TQY2h1rX5gF4hmE4Qc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 20736 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 160 + MacroTileA: 32 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 1 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x128_MI17VRJH_5pnmfoUU-FHr32iGMlOdeQUYJplapfD41zqrY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 22016 + LdsInitCVgprs: false + LdsNumBytes: 22016 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 22016 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x128_MI16AqtF9WEJ-EpQbhJnOZKbC372wZ4o2kh1rPtJZau1G1U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17664 + LdsInitCVgprs: false + LdsNumBytes: 17664 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 1 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x128_MI16WqqFFjCIevU2bTOwvq53GHxOVUs4xuB_iw0OyIMRO9s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x128_MI16O0pSqsQ7I3lGvs23ES_yKR_wzzq3HSwM1IcyJmGtnhc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25728 + LdsInitCVgprs: false + LdsNumBytes: 25728 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x80x128_MI1Jybduy0hX_yQp-cMS6QlHSWe9YueMgbIOjgYTfyE4xY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1280_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 16 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1280 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45312 + LdsInitCVgprs: false + LdsNumBytes: 45312 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 10496 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45312 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 80 + MacroTileA: 256 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1280_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x128_MI1Mhf-Vi17l2Fosir59CVL3Da6ElGCICe4ioDdRimtZBI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB768_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 16 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 768 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41216 + LdsInitCVgprs: false + LdsNumBytes: 41216 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 6400 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 48 + MacroTileA: 256 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB768_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x128_MI1zfbotELeA8YVwjd3HOC5VjWft_ZNaVArBeo8Zgzp6wo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 16 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37120 + LdsInitCVgprs: false + LdsNumBytes: 37120 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT208x256x128_MIPJDSEwAxzWM4fDuAEI3dit5Qm8ipFSRMSAIZnl6eve4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT208x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [13, 4] + MIWaveTileA: 13 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 208 + MacroTile1: 256 + MacroTileA: 208 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 208 + NumGlobalWriteVectorsPerThread: 208 + NumLoadsA: 13 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 13 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT208x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 52 + ThreadTile1: 4 + ThreadTileA: 52 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT176x256x128_MIHlGj0Igh78VESckoEVhEEojHcEyAVYdCerN_cX6Ldrw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT176x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 28160 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28160 + LdsOffsetB_Blk: 93696 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 28160 + LdsOffsetMetadata_Blk: 93696 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [11, 4] + MIWaveTileA: 11 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 176 + MacroTile1: 256 + MacroTileA: 176 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 176 + NumGlobalWriteVectorsPerThread: 176 + NumLoadsA: 11 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 11 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT176x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 44 + ThreadTile1: 4 + ThreadTileA: 44 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT144x256x128_MIs43vjb8vXLGfeBgy4ohTkGSxXW8PsCp3uz_zeqpVxeM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT144x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 121856 + LdsInitCVgprs: false + LdsNumBytes: 121856 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [9, 4] + MIWaveTileA: 9 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 144 + MacroTile1: 256 + MacroTileA: 144 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT144x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 36 + ThreadTile1: 4 + ThreadTileA: 36 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT112x256x128_MIYY998OJmuRcBnYy6KGEItwDq2j2QehUOs8VfioGt_EY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT112x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17920 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 83456 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 83456 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [7, 4] + MIWaveTileA: 7 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 112 + MacroTile1: 256 + MacroTileA: 112 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 7 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT112x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 4 + ThreadTileA: 28 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT80x256x128_MI1riWq2NrSVGbHY4ulT3CBZ1ka9jZSPImqGDsOLfR739U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT80x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46080 + LdsInitCVgprs: false + LdsNumBytes: 46080 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 78336 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 78336 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT80x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x128_MI109tIWmrNFpIMCvCKD44FEjLK6mNrPP0ecMYRKJWv__o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 73216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 73216 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 256 + MacroTileA: 48 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x128_MI1ePlkOMYOjA6Sj3U5aK9R-vdkoYkQzZKOf49wW2bLE70= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35840 + LdsInitCVgprs: false + LdsNumBytes: 35840 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 68096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 68096 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x256_MI1R3pW5TGY8A5opfshY0CC3g5uD4DXiGRwS1VnQHSdtTs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60672 + LdsInitCVgprs: false + LdsNumBytes: 60672 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 117760 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 60672 + LdsOffsetMetadata_Blk: 117760 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 32 + MacroTileA: 192 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x256_MI13Sbf9PQEx5F-iKE9julI7_PdDt3yPapFP4vCFUpDo1E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 64 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62976 + LdsInitCVgprs: false + LdsNumBytes: 62976 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 111616 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 62976 + LdsOffsetMetadata_Blk: 111616 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x256_MI137t5Uvc0ssl50sE4SNlCcUp-XUoCcLX9_I4uGMXledk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 111616 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 54528 + LdsOffsetMetadata_Blk: 111616 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 32 + MacroTileA: 160 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 10 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x256_MIOmRHwGdcC8gFOr3dPzasFfQWs0PP2cDcwy6aKZU2f5U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x256_MI1454Bwxd_MIvQjuG-Y_CvHs_qnR7PgRRvHGhCUO-XbOY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 64 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x256_MI1Rmqg4myY3vDAMf_l0hT_13f8mLa__KusFSpIvX00Gq4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x256_MI15NEBMkxG3p9Ig0Cr9U7Vscm7bIb_z6qr2ylZCFd2y30= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x256_MI16wxJkzA64nljgT6A4tMYPR-qnvL3NK7-LGLeTrL93K28= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 64 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 + LdsInitCVgprs: false + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x256_MI16s4XHH5Mm_pX3J5f5thK2zPFz4wk3wfVK1ZsVsKy_LpE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36096 + LdsInitCVgprs: false + LdsNumBytes: 36096 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x256_MI1XeU_kq-TCV7gFRA0zjadiPg5xVzK4J_9tcfhp6RHcE8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x256_MI16uy_AuCnsXVT_W8ZUwPY2xZhbdvr0s1-k6m6FDS2GyPc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 64 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x256_MI16JHWtGGP8udHdAtFC_ITt55YeNZjw7qy9YbaDQmBmLnw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x256_MI1CMycMeJchCKewZBbLPkfdDttQ7jpxPNi2DcgEi0aA7Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x256_MI167PfjxLOUHYcmsWqM3Jr5QHvep_HsB981kR15bVmyR_4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34048 + LdsInitCVgprs: false + LdsNumBytes: 34048 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34048 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x256_MI16hW4G8jnTFUiWZWnW0Tmfmi9zpySUXkWHngzRijqsswA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 64 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x256_MI16pkwZPXEjbcu5SfItYZgOkhxquryDik7C3Mlw6DhdC9c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17664 + LdsInitCVgprs: false + LdsNumBytes: 17664 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x256_MI1ag0qOUjDoYcyoioR4OdNEh8L5HMCVTkf-1kVLG0Ke8Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 16 + LSPA: 16 + LSPB: 256 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 71936 + LdsInitCVgprs: false + LdsNumBytes: 71936 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 71936 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x256_MI1RElBQ1lA5tfxLjjMGZY4_t9WElkqvIrWw-ZvypHxsS4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 80384 + LdsInitCVgprs: false + LdsNumBytes: 80384 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 66560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 144896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 80384 + LdsOffsetMetadata_Blk: 144896 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 256 + MacroTileA: 48 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x256_MI1g4O7XQYq5Erjhq0mbnd0ZzsOn7Y74NZ0Ed50NCU8Cko= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 71168 + LdsInitCVgprs: false + LdsNumBytes: 71168 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 66560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 135680 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 71168 + LdsOffsetMetadata_Blk: 135680 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x512_MI16BfEq8Qvc7y_UvNL_Jl4pNSs8RolDE0lSBE0kSa4q37k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 64 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x512_MI16802-xq_nYZ8mMbx6wb5jMH9gLrNlJOzl_bKrKs1SBqY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 64 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16c-3ymAqE5iIzAVIl4M4mFas_yKBzwB8S5hIJHrx_G10= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 32 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16wOpiFFua6FkiLWfcBJsaLKAZCjHhoW8GSOhXtRo-XIc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 32 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x512_MI16l2koiE9edxgF5EcPYxeXQhdktTUf3O_Nh5I8-rOv_5c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 16 + LSPA: 8 + LSPB: 256 + LVCA: 32 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17408 + LdsInitCVgprs: false + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x512_MI16PDTjUmNdBPeXIoutsSh54JIRkDgOgTuGJ5_UUnXYC30= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 16 + LSPA: 8 + LSPB: 256 + LVCA: 32 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x512_MI16FZJXxZzBUYpkHY1uHTOo5xfvmHSftJ7Ss8JFmsnHOq4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 16 + LSPA: 8 + LSPB: 256 + LVCA: 32 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x512_MI16UExxzleg69jKCOIPv3aHKRuxpeCJ0OtN8U8_i8380wE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 32 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 + LdsInitCVgprs: false + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x512_MI16486QKgKm2Q-ueCL9nw3616pMoSPYJwy96wpaKy2ZR9g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 64 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41984 + LdsInitCVgprs: false + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x1024_MI1gJ-iaCReTZwgtOaq-r_7Ae1Xx5lpHHILXpRm4hXIFWo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 16 + LSPA: 4 + LSPB: 256 + LVCA: 64 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x1024_MI1TOymj1f06yNxQYQNnDElBwV9Je3kDkNSQdL4ZFlGHxw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 16 + LSPA: 4 + LSPB: 256 + LVCA: 64 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x1024_MI1eVyecsmYTDI_tnvIi0T4aak3HoYfyCs1_BMjsWhSVGA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 16 + LSPA: 4 + LSPB: 256 + LVCA: 64 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84992 + LdsInitCVgprs: false + LdsNumBytes: 84992 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84992 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x1024_MI1SvhFCnFpOT5VXBaOtUT022uRF_6hTz1blD7UqHf2kpA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 32 + LSPA: 4 + LSPB: 128 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x32_MI16xDWExf5YqS6e0I09q-b7LtSPBEclPiz1pvMgNxDpD5yY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6464 + LdsInitCVgprs: false + LdsNumBytes: 6464 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 8 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 3 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- null +- null +- null +- null +- DeviceEfficiency +- Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs.yaml new file mode 100644 index 000000000000..4ce861d61db1 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs.yaml @@ -0,0 +1,27357 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x128x128_MI93W5TiEccmFg8wGd3V3_i7anSx2in1g2Nvgr1nxFMW0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x96x128_MI1WBDB2zkUie8-CEEoI_ArcQdbGzpndP9LCMEirfw4oOM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46336 + LdsInitCVgprs: false + LdsNumBytes: 46336 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46336 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x64x128_MI1byiwSEFKpQOHfD3AL3aMNi54Gti0tPrWia0W_RI3F9I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x128_MI1drE7ijfboWVMJx6XbcD7R5giTiA773IMYwA0SWdbSew= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38144 + LdsInitCVgprs: false + LdsNumBytes: 38144 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38144 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x128x128_MIlwPgpsfEr1tHJhWc_dQ16xuUglNHBRvb9FY8kgrV2uo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 52736 + LdsInitCVgprs: false + LdsNumBytes: 52736 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 4] + MIWaveTileA: 7 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 128 + MacroTileA: 224 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 7 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 4 + ThreadTileA: 28 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x96x128_MI1vpzVkbJ5WB1luwHLUPLp6GT1RJDqsKYFBruB0CuPp3k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 48384 + LdsInitCVgprs: false + LdsNumBytes: 48384 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 48384 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 3] + MIWaveTileA: 7 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 96 + MacroTileA: 224 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 84 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 7 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 3 + ThreadTileA: 28 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x64x128_MI1zuV98IyOwxElPoy5ichUD4fa6nssRrdqd1Q3modyHdk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44288 + LdsInitCVgprs: false + LdsNumBytes: 44288 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44288 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 7 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x32x128_MI1xqlNtEYE-E0Mz0QATa9_dzumE9-a3Wvr1cR9tzlzYW8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40192 + LdsInitCVgprs: false + LdsNumBytes: 40192 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40192 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 32 + MacroTileA: 224 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 7 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x256x128_MICgefWmv3HWofIYKXJsq8855MHgkupzgrkVMcs4u-by0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 125952 + LdsInitCVgprs: false + LdsNumBytes: 125952 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x128x128_MInHiNZ23NKfFsK7oWehXsdVhdC3keTzJy0tZ_ibW9pzc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 + LdsInitCVgprs: false + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x96x128_MI1rAFgL4PTZQ2gx_CQR3J0YNYjnR6ZpZqLT3kKoVcsb8Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40192 + LdsInitCVgprs: false + LdsNumBytes: 40192 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40192 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 96 + MacroTileA: 192 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x128_MI1Vg4WjQQgNmrK6Ytw7ZwOj3TBnXGNte5TZoyB4kGGvxQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36096 + LdsInitCVgprs: false + LdsNumBytes: 36096 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x128_MI1WRxklg9QBBuMW_MrlGv_hyeD_1KjPxvE5CvBb2xGGdk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32000 + LdsInitCVgprs: false + LdsNumBytes: 32000 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 60416 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32000 + LdsOffsetMetadata_Blk: 60416 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 32 + MacroTileA: 192 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x256x128_MIGmZq9BXDiHJ0tc5zxXOziePTEBPwz6cUZrAUknIRT7E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123904 + LdsInitCVgprs: false + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 8] + MIWaveTileA: 5 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 8 + ThreadTileA: 20 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x192x128_MIwjK4OAX-B9dLgo7aLraklJg0ipZLFjGXad70tFlHa_4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50432 + LdsInitCVgprs: false + LdsNumBytes: 50432 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50432 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 6] + MIWaveTileA: 5 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 192 + MacroTileA: 160 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 5 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 6 + ThreadTileA: 20 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x128x128_MIXGlL2yJ1nYGKMcvOKU86eOyg9fj72cyU6dwaNXql1aE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42496 + LdsInitCVgprs: false + LdsNumBytes: 42496 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42496 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x96x128_MI1d_LfxY9vu-XLyyQyjblMOjGGccv6SWDTcAMP2nTGrZ4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38144 + LdsInitCVgprs: false + LdsNumBytes: 38144 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38144 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 3] + MIWaveTileA: 5 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 96 + MacroTileA: 160 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 5 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 3 + ThreadTileA: 20 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x128_MI1xJMqskwq5v3yaRA45HS_l-Q0PeuwEvXnjkyc8iEsIuQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34048 + LdsInitCVgprs: false + LdsNumBytes: 34048 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34048 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x128_MI1J2YusILFQCwZJTPycybXWTPnKyMth_wvsZJGJUL6XBA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 58368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 58368 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 32 + MacroTileA: 160 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 5 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x256x128_MIK3HX_ARTjdrJyQgn8ka1A08cR0NqiFPRF8k7JYx3lno= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x192x128_MIAYYyDwfBJqmmCRgY0wORDPvjgSZIA73oaclExRH_3Ks= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x160x128_MIL86z76qmkuTn8D76ItocU1Fqkm1UTu79IZpG8k_14Mo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38144 + LdsInitCVgprs: false + LdsNumBytes: 38144 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 20736 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38144 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x128_MIcEWO-iBcRFoqXFm5FnHIJnGhh114ULGYJcbTf6WzdFI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x128_MI1uj-yswGYT2k5VXIiR3VHictFIePTXmx0STeF7oDnJ6c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x128_MI1_uypCl737WTznRBFG8qQSTk3KV2VA62EV3JnGAPhybs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x128_MI1SN9OiW0wmLBhcMdgMSszjYfbUYfIwyxoyCZVswMyYtQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 21760 + LdsInitCVgprs: false + LdsNumBytes: 21760 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21760 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x256x128_MI1OdZvyKg_z2Mm3twNkO33DsDFI4XHPhixfFGT8M7-fns= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 48128 + LdsInitCVgprs: false + LdsNumBytes: 48128 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 48128 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 8] + MIWaveTileA: 3 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 8 + ThreadTileA: 12 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x192x128_MI1-rYnfwe4wvrEHrwuVO2V79PzVjbfHc2PuGqRKVJDgfI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40192 + LdsInitCVgprs: false + LdsNumBytes: 40192 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40192 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 6] + MIWaveTileA: 3 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 3 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 6 + ThreadTileA: 12 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x160x128_MI1QbCX-NbN9_FulNkEv2WcOQVCSkdBBRKzE6PDyeD0PIc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36096 + LdsInitCVgprs: false + LdsNumBytes: 36096 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 20736 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 160 + MacroTileA: 96 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 3 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x128_MI15g6oi8kYRfTdSkq4hb7l4vHYA3S6a_b3gQ32PTizD-c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 + LdsInitCVgprs: false + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x128_MI16zaeoUJs0j0gHGLtiiAi_mKOg4oYgz97kp3_f949MqD0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27904 + LdsInitCVgprs: false + LdsNumBytes: 27904 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27904 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x128_MI16n1I05diAnU09ahBMg4ZOJ69DUU98umJt7phil649P_I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23808 + LdsInitCVgprs: false + LdsNumBytes: 23808 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x128_MI16qYVsRYavL1_uEB81mfehkE0xypBMpwpmv8NTKwWUCZQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 19712 + LdsInitCVgprs: false + LdsNumBytes: 19712 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 19712 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x256x128_MI1rLkRPc5Fv6KcZ0pkpjti_dDNkwatHGJAjTt5YgSUF0A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41984 + LdsInitCVgprs: false + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x128_MI1MsUtatzu8gT7lwtRqOR1GlkYA3LAi1h7-WpiR8njS-s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34048 + LdsInitCVgprs: false + LdsNumBytes: 34048 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34048 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x128_MI17ixososrZhbGip5glmYhzY0dhoWQ6s92NzkMuYpNd6c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 20736 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x128_MI1O6oTiyV66Nmoj0i8-AQlJj5R-NLtNsfH6diaQrWbGzQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x128_MI16BDy2huNg1pYAEVpwRmCSx6TZ3y-fyTBhxwPrXBTUMn0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 21760 + LdsInitCVgprs: false + LdsNumBytes: 21760 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 21760 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x128_MI16T76NxLk5ebc1y9bf0bGaRoLE1Btglvx1amB35nt2jrg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17664 + LdsInitCVgprs: false + LdsNumBytes: 17664 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x128_MI164mp4kBtFpiXW7TfMmUnYxQd7fjnanNrZue8IY8R1Nt4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29824 + LdsInitCVgprs: false + LdsNumBytes: 29824 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x128_MI1CBdovKQljtt9JJeN35QUWWaGXoZh_9PeLDfb3HuWMiA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37888 + LdsInitCVgprs: false + LdsNumBytes: 37888 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 70656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37888 + LdsOffsetMetadata_Blk: 70656 + LdsPadA: 32 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 8] + MIWaveTileA: 1 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB0_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x128_MI1eYnh8ExIG9i1WyAXPWhGKC2q4srWo55-2vcORDH9kEs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 29952 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 6] + MIWaveTileA: 1 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 192 + MacroTileA: 32 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 1 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x128_MI1m1tiKe28X_8BSOyU__5gEgsr8TQY2h1rX5gF4hmE4Qc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 20736 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 160 + MacroTileA: 32 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 1 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x128_MI17VRJH_5pnmfoUU-FHr32iGMlOdeQUYJplapfD41zqrY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 22016 + LdsInitCVgprs: false + LdsNumBytes: 22016 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 22016 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x128_MI16AqtF9WEJ-EpQbhJnOZKbC372wZ4o2kh1rPtJZau1G1U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17664 + LdsInitCVgprs: false + LdsNumBytes: 17664 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 12544 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 1 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x128_MI16WqqFFjCIevU2bTOwvq53GHxOVUs4xuB_iw0OyIMRO9s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29952 + LdsInitCVgprs: false + LdsNumBytes: 29952 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x128_MI16O0pSqsQ7I3lGvs23ES_yKR_wzzq3HSwM1IcyJmGtnhc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25728 + LdsInitCVgprs: false + LdsNumBytes: 25728 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x80x128_MI1Jybduy0hX_yQp-cMS6QlHSWe9YueMgbIOjgYTfyE4xY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1280_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 16 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1280 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45312 + LdsInitCVgprs: false + LdsNumBytes: 45312 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 10496 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45312 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 80 + MacroTileA: 256 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1280_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x128_MI1Mhf-Vi17l2Fosir59CVL3Da6ElGCICe4ioDdRimtZBI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB768_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 16 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 768 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41216 + LdsInitCVgprs: false + LdsNumBytes: 41216 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 6400 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 48 + MacroTileA: 256 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB768_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x128_MI1zfbotELeA8YVwjd3HOC5VjWft_ZNaVArBeo8Zgzp6wo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 16 + LSPA: 32 + LSPB: 128 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37120 + LdsInitCVgprs: false + LdsNumBytes: 37120 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT208x256x128_MIPJDSEwAxzWM4fDuAEI3dit5Qm8ipFSRMSAIZnl6eve4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT208x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [13, 4] + MIWaveTileA: 13 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 208 + MacroTile1: 256 + MacroTileA: 208 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 208 + NumGlobalWriteVectorsPerThread: 208 + NumLoadsA: 13 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 13 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT208x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 52 + ThreadTile1: 4 + ThreadTileA: 52 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT176x256x128_MIHlGj0Igh78VESckoEVhEEojHcEyAVYdCerN_cX6Ldrw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT176x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 28160 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28160 + LdsOffsetB_Blk: 93696 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 28160 + LdsOffsetMetadata_Blk: 93696 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [11, 4] + MIWaveTileA: 11 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 176 + MacroTile1: 256 + MacroTileA: 176 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 176 + NumGlobalWriteVectorsPerThread: 176 + NumLoadsA: 11 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 11 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT176x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 44 + ThreadTile1: 4 + ThreadTileA: 44 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT144x256x128_MIs43vjb8vXLGfeBgy4ohTkGSxXW8PsCp3uz_zeqpVxeM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT144x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 121856 + LdsInitCVgprs: false + LdsNumBytes: 121856 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [9, 4] + MIWaveTileA: 9 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 144 + MacroTile1: 256 + MacroTileA: 144 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT144x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 36 + ThreadTile1: 4 + ThreadTileA: 36 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT112x256x128_MIYY998OJmuRcBnYy6KGEItwDq2j2QehUOs8VfioGt_EY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT112x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17920 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 83456 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 83456 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [7, 4] + MIWaveTileA: 7 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 112 + MacroTile1: 256 + MacroTileA: 112 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 7 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT112x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 4 + ThreadTileA: 28 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT80x256x128_MI1riWq2NrSVGbHY4ulT3CBZ1ka9jZSPImqGDsOLfR739U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT80x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46080 + LdsInitCVgprs: false + LdsNumBytes: 46080 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 78336 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 78336 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT80x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x128_MI109tIWmrNFpIMCvCKD44FEjLK6mNrPP0ecMYRKJWv__o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 73216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 73216 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 256 + MacroTileA: 48 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x128_MI1ePlkOMYOjA6Sj3U5aK9R-vdkoYkQzZKOf49wW2bLE70= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35840 + LdsInitCVgprs: false + LdsNumBytes: 35840 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 68096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 68096 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x256_MI1R3pW5TGY8A5opfshY0CC3g5uD4DXiGRwS1VnQHSdtTs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60672 + LdsInitCVgprs: false + LdsNumBytes: 60672 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 117760 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 60672 + LdsOffsetMetadata_Blk: 117760 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 32 + MacroTileA: 192 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x256_MI13Sbf9PQEx5F-iKE9julI7_PdDt3yPapFP4vCFUpDo1E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 64 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 62976 + LdsInitCVgprs: false + LdsNumBytes: 62976 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 111616 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 62976 + LdsOffsetMetadata_Blk: 111616 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x256_MI137t5Uvc0ssl50sE4SNlCcUp-XUoCcLX9_I4uGMXledk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 111616 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 54528 + LdsOffsetMetadata_Blk: 111616 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 32 + MacroTileA: 160 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 10 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x256_MIOmRHwGdcC8gFOr3dPzasFfQWs0PP2cDcwy6aKZU2f5U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI1454Bwxd_MIvQjuG-Y_CvHs_qnR7PgRRvHGhCUO-XbOY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 64 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI1Rmqg4myY3vDAMf_l0hT_13f8mLa__KusFSpIvX00Gq4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42240 + LdsInitCVgprs: false + LdsNumBytes: 42240 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42240 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI15NEBMkxG3p9Ig0Cr9U7Vscm7bIb_z6qr2ylZCFd2y30= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x256_MI16wxJkzA64nljgT6A4tMYPR-qnvL3NK7-LGLeTrL93K28= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 64 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 + LdsInitCVgprs: false + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x256_MI16s4XHH5Mm_pX3J5f5thK2zPFz4wk3wfVK1ZsVsKy_LpE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36096 + LdsInitCVgprs: false + LdsNumBytes: 36096 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI1XeU_kq-TCV7gFRA0zjadiPg5xVzK4J_9tcfhp6RHcE8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16uy_AuCnsXVT_W8ZUwPY2xZhbdvr0s1-k6m6FDS2GyPc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 64 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x256_MI16JHWtGGP8udHdAtFC_ITt55YeNZjw7qy9YbaDQmBmLnw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25856 + LdsInitCVgprs: false + LdsNumBytes: 25856 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI1CMycMeJchCKewZBbLPkfdDttQ7jpxPNi2DcgEi0aA7Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x256_MI167PfjxLOUHYcmsWqM3Jr5QHvep_HsB981kR15bVmyR_4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34048 + LdsInitCVgprs: false + LdsNumBytes: 34048 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 24832 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34048 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x256_MI16hW4G8jnTFUiWZWnW0Tmfmi9zpySUXkWHngzRijqsswA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 64 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x256_MI16pkwZPXEjbcu5SfItYZgOkhxquryDik7C3Mlw6DhdC9c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17664 + LdsInitCVgprs: false + LdsNumBytes: 17664 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x256_MI1ag0qOUjDoYcyoioR4OdNEh8L5HMCVTkf-1kVLG0Ke8Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 16 + LSPA: 16 + LSPB: 256 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 71936 + LdsInitCVgprs: false + LdsNumBytes: 71936 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 71936 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x256_MI1RElBQ1lA5tfxLjjMGZY4_t9WElkqvIrWw-ZvypHxsS4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 80384 + LdsInitCVgprs: false + LdsNumBytes: 80384 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 66560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 144896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 80384 + LdsOffsetMetadata_Blk: 144896 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 256 + MacroTileA: 48 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x256_MI1g4O7XQYq5Erjhq0mbnd0ZzsOn7Y74NZ0Ed50NCU8Cko= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 71168 + LdsInitCVgprs: false + LdsNumBytes: 71168 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 66560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 135680 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 71168 + LdsOffsetMetadata_Blk: 135680 + LdsPadA: 32 + LdsPadB: 64 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA32_LPB64_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x512_MI16BfEq8Qvc7y_UvNL_Jl4pNSs8RolDE0lSBE0kSa4q37k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 64 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x512_MI16802-xq_nYZ8mMbx6wb5jMH9gLrNlJOzl_bKrKs1SBqY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 64 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16c-3ymAqE5iIzAVIl4M4mFas_yKBzwB8S5hIJHrx_G10= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 32 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16wOpiFFua6FkiLWfcBJsaLKAZCjHhoW8GSOhXtRo-XIc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 32 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x512_MI16l2koiE9edxgF5EcPYxeXQhdktTUf3O_Nh5I8-rOv_5c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 16 + LSPA: 8 + LSPB: 256 + LVCA: 32 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17408 + LdsInitCVgprs: false + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x512_MI16PDTjUmNdBPeXIoutsSh54JIRkDgOgTuGJ5_UUnXYC30= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 16 + LSPA: 8 + LSPB: 256 + LVCA: 32 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x512_MI16FZJXxZzBUYpkHY1uHTOo5xfvmHSftJ7Ss8JFmsnHOq4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 16 + LSPA: 8 + LSPB: 256 + LVCA: 32 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x512_MI16UExxzleg69jKCOIPv3aHKRuxpeCJ0OtN8U8_i8380wE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 32 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 + LdsInitCVgprs: false + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x512_MI16486QKgKm2Q-ueCL9nw3616pMoSPYJwy96wpaKy2ZR9g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 64 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41984 + LdsInitCVgprs: false + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x1024_MI1gJ-iaCReTZwgtOaq-r_7Ae1Xx5lpHHILXpRm4hXIFWo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 16 + LSPA: 4 + LSPB: 256 + LVCA: 64 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x1024_MI1TOymj1f06yNxQYQNnDElBwV9Je3kDkNSQdL4ZFlGHxw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 16 + LSPA: 4 + LSPB: 256 + LVCA: 64 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x1024_MI1eVyecsmYTDI_tnvIi0T4aak3HoYfyCs1_BMjsWhSVGA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 16 + LSPA: 4 + LSPB: 256 + LVCA: 64 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84992 + LdsInitCVgprs: false + LdsNumBytes: 84992 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84992 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x1024_MI1SvhFCnFpOT5VXBaOtUT022uRF_6hTz1blD7UqHf2kpA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 32 + LSPA: 4 + LSPB: 128 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x32_MI16xDWExf5YqS6e0I09q-b7LtSPBEclPiz1pvMgNxDpD5yY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6464 + LdsInitCVgprs: false + LdsNumBytes: 6464 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 8 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bjlk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 3 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- null +- null +- null +- null +- DeviceEfficiency +- Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs.yaml new file mode 100644 index 000000000000..3bbc9c8ab44c --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs.yaml @@ -0,0 +1,44324 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x256x128_MIrZs5mdCNmpikVNpV2YQQNFBCAdivVS8D1AsBXrhUP4k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x224x128_MIADOZHpI0e4kz3PekZsq-nbyBZU7EZIxEg7KcvUC6g3g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x192x128_MIszVB7i9sCthoswarzra0qLsMXdGUHQA9HKGzu-1z5Kk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x160x128_MI4oZDINgtVU-f19tz2HYWh_LmdGFiZplMuFYBiLKIRVE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x160x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124928 + LdsInitCVgprs: false + LdsNumBytes: 124928 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x160x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x128x128_MIQveu_FRPQ7kB2acKv85tmm0lWhIAYF1e-bVgSb8HhRU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x96x128_MI1nNTR_ZcKmsj4A34uOMZTZX_oilYaQuaQFL2SNhs8cXo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49152 + LdsInitCVgprs: false + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x64x128_MI17E3Elp9R2iGRIjoSUJt7ENU0IkIEDCYD27G5goO_zfQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x128_MI14oL9i2wAQa7K6yEREmecBQB_z40_9_5z5qamIYTi9BA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38912 + LdsInitCVgprs: false + LdsNumBytes: 38912 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38912 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x256x128_MIVqWf-zHb4fGrqaI_PNfaMIuBSbf24WaOtDVKEmRwTy0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 166912 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 166912 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 8] + MIWaveTileA: 7 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 256 + MacroTileA: 224 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLoadsA: 7 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 8 + ThreadTileA: 28 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x224x128_MI2FE1_xdi6gisBeCfrvgksfzzbx9Yx6-LxacJqFoorOs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 71680 + LdsInitCVgprs: false + LdsNumBytes: 71680 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 166912 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 71680 + LdsOffsetMetadata_Blk: 166912 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 7] + MIWaveTileA: 7 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 224 + MacroTileA: 224 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 196 + NumGlobalWriteVectorsPerThread: 196 + NumLoadsA: 7 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 7 + ThreadTileA: 28 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x192x128_MItgCKUscKBBiMtBOmdwUt6qAHeM_MR7YbKWEkh39p8UQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 129024 + LdsInitCVgprs: false + LdsNumBytes: 129024 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 6] + MIWaveTileA: 7 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 192 + MacroTileA: 224 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 168 + NumGlobalWriteVectorsPerThread: 168 + NumLoadsA: 7 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 6 + ThreadTileA: 28 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x160x128_MI_UOU6HZrGnyaniPidkPyA2vc3eMgOLn6XHhdMyr3gok= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x160x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 5] + MIWaveTileA: 7 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 160 + MacroTileA: 224 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 140 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 7 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x160x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 5 + ThreadTileA: 28 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x128x128_MIdhuZzZrmslv6fIdikHYnhn1dpXNXnkfYYXLJoFZzt6s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 4] + MIWaveTileA: 7 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 128 + MacroTileA: 224 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 7 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 4 + ThreadTileA: 28 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x96x128_MI1b5RoEaxogDkty9pH0a4h4iqEK74VHsxBERA-H-NeUNY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 3] + MIWaveTileA: 7 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 96 + MacroTileA: 224 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 84 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 7 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 3 + ThreadTileA: 28 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x64x128_MI1g3lxPhwHBbhj-5_vMz-ezv_Gct2IZefQGw3UsfUuGuM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 7 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x32x128_MI1pRHgvlAR7N2hth7GOj12UHkPW1O97Tnl14UAnpJSv9c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 32 + MacroTileA: 224 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 7 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x256x128_MICt7le9q16ngZmPo_ZNYkoWOjOg2jVjonbKgLeTEzLxQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x224x128_MIFCEugnbAE1o_KNOxQTjmgk1-riRp7-bqr1XYLHQkDJI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x224x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 129024 + LdsInitCVgprs: false + LdsNumBytes: 129024 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 7] + MIWaveTileA: 6 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 224 + MacroTileA: 192 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 168 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 6 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x224x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 7 + ThreadTileA: 24 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x192x128_MIsXAzSnuik23-u_Tqj9IA9yIVlTTHvUnKVAh5tdPfE-0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120832 + LdsInitCVgprs: false + LdsNumBytes: 120832 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x160x128_MIdawCsxOV8Q1yfZkDghTj5ROObRZEHHk5r8TLSJ8-4DI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 5] + MIWaveTileA: 6 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 160 + MacroTileA: 192 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 5 + ThreadTileA: 24 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x128x128_MI96PVzcjbGKsmGAPHOizb5QovZZPMnBXqYWXYWCMR6RE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x96x128_MI1XF9bGkyS2smPrKpZKRvXqZE_IlG80gFUg26F5mTZLis= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 96 + MacroTileA: 192 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x128_MI1cyGRGs5fNOjzNULtz2NEdDFcpvgD-53w5wck9Mz0rEA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36864 + LdsInitCVgprs: false + LdsNumBytes: 36864 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x128_MI1dXOVrMlvBwO6y7ESILkUB5p9GtB-Km9ykpVzg9YqhsY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 60416 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 60416 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 32 + MacroTileA: 192 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x256x128_MI_FMKfIaIlw04mnTidFqT-Fnj0gDbPBKMLBy9AxGoipc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124928 + LdsInitCVgprs: false + LdsNumBytes: 124928 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 8] + MIWaveTileA: 5 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 8 + ThreadTileA: 20 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x224x128_MIgvcBHckHBXx7rYEXvd58PMUeCwZSGsDJtMgKSLGOngQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x224x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 7] + MIWaveTileA: 5 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 224 + MacroTileA: 160 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 140 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 5 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x224x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 7 + ThreadTileA: 20 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x192x128_MIiC0cBFJQlkymU09JdEq7dUq1wvcSX9wghKOSQxkOErI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 6] + MIWaveTileA: 5 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 192 + MacroTileA: 160 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 5 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 6 + ThreadTileA: 20 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x160x128_MIwREMu9dXpHu0hNi1Te8Ryumj0HYpn1BFkusQoNfhAQ8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 5] + MIWaveTileA: 5 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 160 + MacroTileA: 160 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 100 + NumGlobalWriteVectorsPerThread: 100 + NumLoadsA: 5 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 5 + ThreadTileA: 20 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x128x128_MIcrglZoxFbHHjwhvMHFjwAoFdYaccIqWMvEsbqy0yalI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x96x128_MI1BUsuevlFp3qE9qBkCpl8ZHtyZp2nTMvZX1nz1cQRT7M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 3] + MIWaveTileA: 5 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 96 + MacroTileA: 160 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 5 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 3 + ThreadTileA: 20 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x128_MI1IVM_kq6fQOztyL87pw0KKbmpiglUkaPXmejIGutUrh4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x128_MI1bgH1ukcWIV-QEjZUrEMgbOOF-CIy8Yu_mpQb3L9Zq3g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30720 + LdsInitCVgprs: false + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 58368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 30720 + LdsOffsetMetadata_Blk: 58368 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 32 + MacroTileA: 160 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 5 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x256x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 + LdsInitCVgprs: false + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 64, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 64 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 64, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x256x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: 0 + enableLDSTrB: 0 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x224x128_MIHAxeqAl5GZyYpWyj9YBlHLiAot0B1CVx9Nf_5OHRC1Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x192x128_MIIlrnP4n1WnSi3fe0xoxepRfTuuLMMp-jgbnYdGnnvYE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x160x128_MI5bZ7yznefmkxpzzbB-XiEKlKRJu2AMeEgvXx60Tq5L0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x128_MI72iEPgzvyM2ZrWa9mJE7BjRqJtpIrgM81I_tGNEjK0c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x128_MI1xf_ukgQ-Z-qNS4jmkabhCAr3mB7WAQFBVo0jxB_s6wM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x128_MI1j8uVA7i_pGx8ZCBrMR8ZhK3U6bp7J7YjtbYnnKdPxDI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x128_MI1M6uR2wC2h6sr2Hl9YrF8zhWGQ7RgoVsQ-TM7lCE6pPI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 22528 + LdsInitCVgprs: false + LdsNumBytes: 22528 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 22528 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x256x128_MI1lw4esJJcKXjrN7F_gKTEASvr4FPioQKVPOZImphrvBY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49152 + LdsInitCVgprs: false + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 8] + MIWaveTileA: 3 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 8 + ThreadTileA: 12 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x224x128_MI1JixbiZckG_HlXVPjjWZHNInPUwLf2xyuU7BTFa5yHI0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 7] + MIWaveTileA: 3 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 224 + MacroTileA: 96 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 84 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 3 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 7 + ThreadTileA: 12 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x192x128_MI1wIqbeUYowBvH47eY0D90voHkVaoBg2lPdbX2d68nIJ4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 6] + MIWaveTileA: 3 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 3 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 6 + ThreadTileA: 12 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x160x128_MI1J-2M-3q4LSCntZjUh3v8nZH3qUcDC89rCJ0Fdq3iMeI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 160 + MacroTileA: 96 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 3 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x128_MI1Vr1CG6o8iRf8BbhZJxSu0FVBXoX31LtlPuM8cs_YEXU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x128_MI16Cnahg_B3KmoGxZkTO1OYZLBTHOrwHNY1VQd2t5IrylI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30720 + LdsInitCVgprs: false + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 30720 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x128_MI16W4yMmcRT_Rcp3ZNo2gSW3q4SJdd9i9DRRKcgqox59VE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24576 + LdsInitCVgprs: false + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x128_MI16zNms4_iu6xgFQiq42WUbXrNnFNyslp04sbAKWAlTbog= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 20480 + LdsInitCVgprs: false + LdsNumBytes: 20480 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x256x128_MI11aFcbk23grO4-hK8aBkA0Q2TJ_RzSnSefhCsshDFWE8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x224x128_MI1BIQ9uHrC9E8FHJ9mzReo0log_q-mFe47Gvfw9lRTxKU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 224 + MacroTileA: 64 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 2 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 7 + ThreadTileA: 8 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x128_MI1oWIDEj_bU2sIdW6WZSqKJI8Okx_mH0MwWvgKg4p0Wh0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36864 + LdsInitCVgprs: false + LdsNumBytes: 36864 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x128_MI1DfS6ucItLGFYy3q0Zz-JZd7icG_ycHgazunawsT8Pns= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x128_MI1jK-MUAAqzxLFJYDxWtp_lRnOH6G_Sa3LS7gsDkzJ5ZI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x128_MI16qBSTIiXAL1b2k0UyETAi0pu4GSfWwKXX5o-ybzBtncE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24576 + LdsInitCVgprs: false + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x128_MI16mOGETtdY_W1srxs4Mfgd04W-cV79aCR6raMD3HA6jWU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 18432 + LdsInitCVgprs: false + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x128_MI16wIclDOa00a_BcP3sOUDpDc_qFY9RvbDE777DV1a9O7k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30720 + LdsInitCVgprs: false + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x128_MI1pg1aJvOuqnmTxe_-2e9_1S4uFGoAaPhgR-zs28ui024= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38912 + LdsInitCVgprs: false + LdsNumBytes: 38912 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 70656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38912 + LdsOffsetMetadata_Blk: 70656 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 8] + MIWaveTileA: 1 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x224x128_MI1bJJ5iziycwP81Vg_rFbnRobq-WE2Gw-XuD7kvGV58LU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 70656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 70656 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 7] + MIWaveTileA: 1 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 224 + MacroTileA: 32 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 1 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 7 + ThreadTileA: 4 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x128_MI1c60kFRwy-kwCbpVB3Be-Tp-X0igfgGU9wMtdCOAL5xM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 6] + MIWaveTileA: 1 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 192 + MacroTileA: 32 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 1 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x128_MI13_eugo-sWMKarMTBe1WI4hmzJdiGvToykcHBbj4WXow= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30720 + LdsInitCVgprs: false + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 30720 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 160 + MacroTileA: 32 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 1 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x128_MI1gSIhM4a60TQQmpGca3zPH_1C9YMOrBSb2Bu9JrxvD-k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 22528 + LdsInitCVgprs: false + LdsNumBytes: 22528 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 22528 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x128_MI16YH7g6N9l5w5imlP3Lp9mdDXwtO_ZtzNO9yUXMFVOjKQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 20480 + LdsInitCVgprs: false + LdsNumBytes: 20480 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 1 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x128_MI16IFZMDGDs3-q6koIU2xIB_XUAfzhmng9Mg_46GO3Rqf4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30720 + LdsInitCVgprs: false + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x128_MI16rLKaVqye5ZVlglTUO9ioHrZ6WB-GJ_Qs2RqP4S8ZIXE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x240x128_MIyoGz0G9DWmdNz6WjS2Bq6rCsCrR_mXM4td8eMkMYEDw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x240x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73216 + LdsInitCVgprs: false + LdsNumBytes: 73216 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 38400 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 165888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73216 + LdsOffsetMetadata_Blk: 165888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 15] + MIWaveTileA: 4 + MIWaveTileB: 15 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 240 + MacroTileA: 256 + MacroTileB: 240 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 8 + NumLoadsB: 15 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 15 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x240x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 15 + ThreadTileA: 16 + ThreadTileB: 15 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x208x128_MIBnc7kWhy6AvjwbNc5adjziCfJiLAVAD1RzoNcS6OMZQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x208x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 68096 + LdsInitCVgprs: false + LdsNumBytes: 68096 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 165888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 68096 + LdsOffsetMetadata_Blk: 165888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 13] + MIWaveTileA: 4 + MIWaveTileB: 13 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 208 + MacroTileA: 256 + MacroTileB: 208 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 208 + NumGlobalWriteVectorsPerThread: 52 + NumLoadsA: 8 + NumLoadsB: 13 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 13 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x208x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 13 + ThreadTileA: 16 + ThreadTileB: 13 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x176x128_MIF5A-1D1ZIPjJsJJ7ttt4AlfEFPQG0riur7Zv_qXG5YI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x176x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128512 + LdsInitCVgprs: false + LdsNumBytes: 128512 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 28160 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 11] + MIWaveTileA: 4 + MIWaveTileB: 11 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 176 + MacroTileA: 256 + MacroTileB: 176 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 176 + NumGlobalWriteVectorsPerThread: 44 + NumLoadsA: 8 + NumLoadsB: 11 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 11 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x176x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 11 + ThreadTileA: 16 + ThreadTileB: 11 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x144x128_MIyRCs1CYgk6qoLPVzZz2698DWR4wMJ7whNVJ8JwJfnZc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x144x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 144 + MacroTileA: 256 + MacroTileB: 144 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x144x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 9 + ThreadTileA: 16 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x112x128_MIaVDafPaNDXaLtu56b1YkZsQUI1aLmxb3jgdu_dv65EA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x112x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 52736 + LdsInitCVgprs: false + LdsNumBytes: 52736 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 17920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 112 + MacroTileA: 256 + MacroTileB: 112 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x112x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x80x128_MI1jNVYmoC6fjSjbIADTf0f9l0Cs4TKifqjd6Xql0ehllk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 47616 + LdsInitCVgprs: false + LdsNumBytes: 47616 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 12800 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 47616 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 80 + MacroTileA: 256 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x128_MI1eX_SljyDEcS5rCPMhk2caJwb3hhe7qqzmb_HRJ0i-U0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42496 + LdsInitCVgprs: false + LdsNumBytes: 42496 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 7680 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42496 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 48 + MacroTileA: 256 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x128_MI1fPZ5I9MI2WZKscvBb8L9C1UGdH6oze92UNxsd_rmjZ0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37376 + LdsInitCVgprs: false + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT240x256x128_MI17whl_3k4Y1EhiqZzn7y-o-UTqlat-Sd3AsiBr6V3nc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT240x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT15_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73216 + LdsInitCVgprs: false + LdsNumBytes: 73216 + LdsNumElementsAlignedA: 38400 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 38400 + LdsOffsetB_Blk: 169472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73216 + LdsOffsetMetadata_Blk: 169472 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [15, 4] + MIWaveTileA: 15 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 240 + MacroTile1: 256 + MacroTileA: 240 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 240 + NumLoadsA: 15 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 15 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT240x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT15_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 60 + ThreadTile1: 4 + ThreadTileA: 60 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT208x256x128_MIjK835T0h-3Ar1N2b4kmmt65KtYLTJhfaZH3LmR2auFA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT208x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 68096 + LdsInitCVgprs: false + LdsNumBytes: 68096 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 68096 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [13, 4] + MIWaveTileA: 13 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 208 + MacroTile1: 256 + MacroTileA: 208 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 208 + NumGlobalWriteVectorsPerThread: 208 + NumLoadsA: 13 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 13 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT208x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 52 + ThreadTile1: 4 + ThreadTileA: 52 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT176x256x128_MIgsJynaoGp5Lr3OC9AUIiNlgAW9kkq7qPUmgK4ohOA_k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT176x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128512 + LdsInitCVgprs: false + LdsNumBytes: 128512 + LdsNumElementsAlignedA: 28160 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28160 + LdsOffsetB_Blk: 93696 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 28160 + LdsOffsetMetadata_Blk: 93696 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [11, 4] + MIWaveTileA: 11 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 176 + MacroTile1: 256 + MacroTileA: 176 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 176 + NumGlobalWriteVectorsPerThread: 176 + NumLoadsA: 11 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 11 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT176x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 44 + ThreadTile1: 4 + ThreadTileA: 44 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT144x256x128_MI6ZUiH-qsm4LTRuwyb7uw6oTXUaPI3Vjpt7WHaUbBOfI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT144x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [9, 4] + MIWaveTileA: 9 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 144 + MacroTile1: 256 + MacroTileA: 144 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT144x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 36 + ThreadTile1: 4 + ThreadTileA: 36 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT112x256x128_MIiu6Dap7c-mOezayg2ePWEWJWIUNWwq_GUAWuc6asW8k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT112x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 52736 + LdsInitCVgprs: false + LdsNumBytes: 52736 + LdsNumElementsAlignedA: 17920 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 83456 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 83456 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [7, 4] + MIWaveTileA: 7 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 112 + MacroTile1: 256 + MacroTileA: 112 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 7 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT112x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 4 + ThreadTileA: 28 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT80x256x128_MI1fyJI8fnAEv3sOPHcVPqOC2AU9kSqOftCPxastr6HZJE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT80x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 47616 + LdsInitCVgprs: false + LdsNumBytes: 47616 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 78336 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 47616 + LdsOffsetMetadata_Blk: 78336 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT80x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x128_MI1-Zo3r6x3T98ZMWTZ4H7LqLtk7xXmH_6zOjhPN09FdTk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42496 + LdsInitCVgprs: false + LdsNumBytes: 42496 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 73216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42496 + LdsOffsetMetadata_Blk: 73216 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 256 + MacroTileA: 48 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x128_MI1wehOWUghk2A68IMMHO37hmuhc-kFrA6X9yfb9WRqhBg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37376 + LdsInitCVgprs: false + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 68096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 68096 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x256_MI1vCzuwEprjPexlyglCW3zRXsjX9xUmGeZzokhH60Z9yE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 75776 + LdsInitCVgprs: false + LdsNumBytes: 75776 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 75776 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x64x256_MI1SiLGOArtDjYGdvuIvIxglRBR2PASF7znHipyq968J9k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 81920 + LdsInitCVgprs: false + LdsNumBytes: 81920 + LdsNumElementsAlignedA: 64512 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 64512 + LdsOffsetB_Blk: 195584 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 81920 + LdsOffsetMetadata_Blk: 195584 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x32x256_MI1Atkrqghs5hZ7rFrRDMn1qRHk9P7RjRIX2jTHJaCZcIs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 64512 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 64512 + LdsOffsetB_Blk: 195584 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 195584 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 32 + MacroTileA: 224 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT224x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x96x256_MI1H6kBMzYhDsiXY1vBEU2Ud8UhLYehCYuWeQaCulrUTHo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 79872 + LdsInitCVgprs: false + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 183296 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 183296 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 96 + MacroTileA: 192 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x256_MI153vNbJPjGrVniCPDOzS5UGzSK6NOLhTT3dYrLoo1YJ4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 183296 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 183296 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x256_MI1y0BaulKQmTtKr2XSp5P3dnaOK0k5UYUxfkhT1HxC_rU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 117760 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 117760 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 32 + MacroTileA: 192 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT192x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x128x256_MIUYoy0JGLYkrhIavz9gjog-Elk__Z0jYq-QXQEcN2PaA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 79872 + LdsInitCVgprs: false + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 177152 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x96x256_MI1cUVPZBoZ-mtmE8sTWF_Az_FU-gB_6I8zhwfiN1pEaks= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 177152 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 3] + MIWaveTileA: 5 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 96 + MacroTileA: 160 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 10 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 3 + ThreadTileA: 20 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x256_MI1EM3GcfaRjwnLLnXwjtnf8XZHB699buVut-TXk4ZjWyc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63488 + LdsInitCVgprs: false + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 111616 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 111616 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x256_MI15N7cdBjXALKtpaYkT17ouFsMqBdqYm1rqUc2cC7GeLE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 + LdsInitCVgprs: false + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 111616 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 111616 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 32 + MacroTileA: 160 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 10 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT160x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x160x256_MIU14cbwZ_t9NC-mBYr7N5MljJmeoGwBb-WUmjyy7ILhU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 79872 + LdsInitCVgprs: false + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x256_MIM08mmLtqT14x3TxGefabTcO290EciaeH6GnjDeU8Bgo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x256_MI1OG3k0SIeaxxBYmDDXCQJ0v824MXDB2cvmAH14Io8mlM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x256_MI1Ee_Jq2fBrVEHXaPp-qikzlaFXpjSm2wKDPvD-MKxLM0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x256_MI13IeiqjCNSTMel5JBLjnHPiB9PE_JYWIIfXqePFminMc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x192x256_MI1qUn-crPudL6dXZ6rCcKnkE9s4w-RcuNNqOhYdbIrR28= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 79872 + LdsInitCVgprs: false + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 158720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 158720 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 6] + MIWaveTileA: 3 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 6 + ThreadTileA: 12 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x160x256_MI18wp9zZc9oencMlWPaDY7BUc_iQgMxwF-RQOzHet7J3I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 158720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 158720 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 160 + MacroTileA: 96 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x256_MI1LrtbILHycv46qRt6NxJYDFm4zT_v-o9V1TAKrHNeAeQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x256_MI164Fx4NhdfXwLWoN6X-Afen5a5uy0a8LeKZgpjvani3CM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 + LdsInitCVgprs: false + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x256_MI16_Nc2lY4fwUC-cbMivJ1U_eNf_9vhMW0WYu6jNMKBzYI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x256_MI16X2E4Wx9NVLCyWp4uy3bGT7QFxR20eVc2o2HQJqwi8Hk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36864 + LdsInitCVgprs: false + LdsNumBytes: 36864 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x224x256_MI1ehl3l0UKCFUW0CqlYWmD5Gv6DpebCzV_7KPnbR_5y4Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x224x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 81920 + LdsInitCVgprs: false + LdsNumBytes: 81920 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 64512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 81920 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 224 + MacroTileA: 64 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x224x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 7 + ThreadTileA: 8 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x256_MI1PpwSxkO--tNKZnLh5u1Zqztz5GRs2qwDUtmWoQ32_0M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x256_MI1BIcxM2OYKG0Jqn5GRpKyHvCNhj2aQceoHMvTm7lJo6Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63488 + LdsInitCVgprs: false + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x256_MI1-eL7HedP-uoMH-4uLF-vatbGDwtmp3GZhw8IuLzDHvE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x256_MI16Tidk1Ou4UA72T_vnHnWAsymx18z-7WG5S7yV6EMzxAE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x256_MI16P7nrnDgCqsPXJVFvUTHGSoy-i5STapWZDg4F8KHN2a8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x256_MI164FNCErQfeGGweRYwOlRLTlQQw-uihsMini7VG5MjqUs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x256_MI1Weg33rA_lR82ctIL5Wv-agG74LvDUGtagD8ZNXCm6m8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 75776 + LdsInitCVgprs: false + LdsNumBytes: 75776 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 66560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 75776 + LdsOffsetMetadata_Blk: 140288 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 8] + MIWaveTileA: 1 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x224x256_MI1yjWOfHPT7EqX9rhhXejY8_eiJ07brJZxCVBlCXR9DBo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x224x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 64512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 140288 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 7] + MIWaveTileA: 1 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 224 + MacroTileA: 32 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 2 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 14 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x224x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 7 + ThreadTileA: 4 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x256_MI1pVP6pjfyPWPVeerMg8FY1EyiGjH4B_j7tc4lRUzRPvg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 6] + MIWaveTileA: 1 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 192 + MacroTileA: 32 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x256_MI1q7mTED5UvVJrOPDvYFNPflDjUkX1okY5IJT3AZBddMM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 + LdsInitCVgprs: false + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 160 + MacroTileA: 32 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x256_MI1fFpYWYjPWsmrBYgoz9CIhz7OHVoysPvt9SgTel6GPWI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x256_MI16mu07e3PYZ_jp2LQxX0lmvu4pmsw_265fxGpIWycVSjg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36864 + LdsInitCVgprs: false + LdsNumBytes: 36864 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x256_MI162x9lNzvJGF2jhOPNNRp1Q3mFaUkUkcNWlM7ecLmXXfs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x256_MI16-01AyvDsXfpTy-wvoK3K25-ZRKuFeCLIRsj5lVXzX_s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 18432 + LdsInitCVgprs: false + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x80x256_MI1LDs52gy7NI4WuOMQ_idbNXBwBK86orsq1YVyOo4IvKs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x80x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 90624 + LdsInitCVgprs: false + LdsNumBytes: 90624 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 90624 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 80 + MacroTileA: 256 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 16 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x80x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x256_MI1NztuuZzeaShCMYTqz64dRhvSvIEetpxjBMHS12EujLs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 81408 + LdsInitCVgprs: false + LdsNumBytes: 81408 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 81408 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 48 + MacroTileA: 256 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x48x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x256_MI1-eS3GREuZbaF_WcdTrJ8tJN1VRBcjitHDgD1jEhWCcE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 72192 + LdsInitCVgprs: false + LdsNumBytes: 72192 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 72192 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT80x256x256_MI1ZLUtUYYiOHV_3Bx1CweJ56xkOj29nbS2TUKPiIsgWeM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT80x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 90624 + LdsInitCVgprs: false + LdsNumBytes: 90624 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 90624 + LdsOffsetMetadata_Blk: 154112 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT80x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x256_MI1QlOH2GDodFrn6A0rxCKHrFBaU_QWfv580nm9xWdZY0k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 81408 + LdsInitCVgprs: false + LdsNumBytes: 81408 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 144896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 81408 + LdsOffsetMetadata_Blk: 144896 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 256 + MacroTileA: 48 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT48x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x256_MI1PZ1TyY_JO23lBPKX9VhACTOX37NNgYiBjf2A-tanW7w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 72192 + LdsInitCVgprs: false + LdsNumBytes: 72192 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 135680 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 72192 + LdsOffsetMetadata_Blk: 135680 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x512_MI16Z6LYIKVcBKQJUXMfCTkH2UmyhUWm90vrVgFH5F5h_is= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 183296 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 183296 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x512_MI165IfRyrGVaZ277jt_y40d11CoSr5lg3fA8j83jyEgno8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 183296 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 183296 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT96x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x512_MI16Gx0c38MGhJUAjADEf6sgUUCYEnSxT0HvEevn-TLzDBA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x512_MI16F-32foTbZU7_IZ8fkLWy78MMRaTz5VOyQ3YshyDK2WM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x512_MI16-AdnHWdGaF-mVZCwCYUd2wq3b2fSNewc3-7rmRp2b3Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x512_MI16pBqfsRIAkQBDjOQRcdakPvu1rZh51vJuBh1AVswNR5Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x512_MI16tO84-u8CmDozhzOYFPjZx1aAnLl_RdoGaIlGdosGUzM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x512_MI16WaJJEhK9J396-Hl1mrtJrwp0VdJSLCW5ohgvQGJ2Muc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x96x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x512_MI16BfEq8Qvc7y_UvNL_Jl4pNSs8RolDE0lSBE0kSa4q37k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x512_MI16802-xq_nYZ8mMbx6wb5jMH9gLrNlJOzl_bKrKs1SBqY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16nxXVLq-_LhX2N8-ggvlvGFb2CSQPdL0Jk31-3Yn8K8w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16S71Chi2csqavB37sGyuzVl_JPK7OXCWtUkcefaNr2-I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x1024_MI1Uh3YqlpEBhxKyjcdbTJSpHtEarJLrJfEIa0Mnxwsb5k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 1024 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x1024_MI19i-it_oJJnIjaw-zFrcLqZWij-txKmEuZwwuPirD8iw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 1024 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x512_MI16t0-mPljiEZtupRhWAFpnxBDrJen-FsvDD34Q7PN3s_4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17408 + LdsInitCVgprs: false + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x512_MI16CAJPr2P6olRKJqA2Fmiymyoh6JhX7nYYGS7ffsGJvmw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x512_MI16K_Na50sAdM91rhXevAlyiM9lrmobRF_PSYV6N76rQzk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x512_MI16dVDH5F1pzH8NV-uymOAiVxprnNui5yf2Mp5t1_LqLTs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x512_MI16Uv-d7icfFxrZ-YUiRaTBsX3xXbc3HcmSMlEdtwh8Yb0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x1024_MI1TNTLRLqQ0uhz1zq5XPt3rxttGE1MT5QoFsMWFmpwZGg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x1024_MI13HHr0kFgQAqJ2TjtZ8tTohe6Udz-5u0CwkMq-_9renQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT32x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x1024_MI1KAql7WR7THsIAKFHzmlcHQVpFofe0Vzt2xFE0wiTOfU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84480 + LdsInitCVgprs: false + LdsNumBytes: 84480 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84480 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT64x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x1024_MI1-LGVgPAQHxTiHAOxIZA3A-LLqgvwFhsRG4LoerQvYqg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x1024_MI1uwD3q8H7ROxJTYYpP8cvtJiz3DeoGvbO5IiJaZpCFnE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84480 + LdsInitCVgprs: false + LdsNumBytes: 84480 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 147968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84480 + LdsOffsetMetadata_Blk: 147968 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 15 + DestDataType: 15 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Alik_Bljk_F8F8S_BH_BiasSHB_HAS_SAB_SCD_SAV_UserArgs_MT16x64x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- null +- null +- null +- null +- DeviceEfficiency +- Prediction diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs.yaml new file mode 100644 index 000000000000..662cf21e7a43 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs.yaml @@ -0,0 +1,44324 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x256x128_MIrZs5mdCNmpikVNpV2YQQNFBCAdivVS8D1AsBXrhUP4k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x224x128_MIADOZHpI0e4kz3PekZsq-nbyBZU7EZIxEg7KcvUC6g3g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x192x128_MIszVB7i9sCthoswarzra0qLsMXdGUHQA9HKGzu-1z5Kk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x160x128_MI4oZDINgtVU-f19tz2HYWh_LmdGFiZplMuFYBiLKIRVE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x160x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124928 + LdsInitCVgprs: false + LdsNumBytes: 124928 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x160x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x128x128_MIQveu_FRPQ7kB2acKv85tmm0lWhIAYF1e-bVgSb8HhRU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x96x128_MI1nNTR_ZcKmsj4A34uOMZTZX_oilYaQuaQFL2SNhs8cXo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49152 + LdsInitCVgprs: false + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x64x128_MI17E3Elp9R2iGRIjoSUJt7ENU0IkIEDCYD27G5goO_zfQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x128_MI14oL9i2wAQa7K6yEREmecBQB_z40_9_5z5qamIYTi9BA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38912 + LdsInitCVgprs: false + LdsNumBytes: 38912 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38912 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x256x128_MIVqWf-zHb4fGrqaI_PNfaMIuBSbf24WaOtDVKEmRwTy0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 166912 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 166912 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 8] + MIWaveTileA: 7 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 256 + MacroTileA: 224 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 224 + NumLoadsA: 7 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 8 + ThreadTileA: 28 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x224x128_MI2FE1_xdi6gisBeCfrvgksfzzbx9Yx6-LxacJqFoorOs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 71680 + LdsInitCVgprs: false + LdsNumBytes: 71680 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 166912 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 71680 + LdsOffsetMetadata_Blk: 166912 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 7] + MIWaveTileA: 7 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 224 + MacroTileA: 224 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 196 + NumGlobalWriteVectorsPerThread: 196 + NumLoadsA: 7 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 7 + ThreadTileA: 28 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x192x128_MItgCKUscKBBiMtBOmdwUt6qAHeM_MR7YbKWEkh39p8UQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 129024 + LdsInitCVgprs: false + LdsNumBytes: 129024 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 6] + MIWaveTileA: 7 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 192 + MacroTileA: 224 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 168 + NumGlobalWriteVectorsPerThread: 168 + NumLoadsA: 7 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 6 + ThreadTileA: 28 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x160x128_MI_UOU6HZrGnyaniPidkPyA2vc3eMgOLn6XHhdMyr3gok= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x160x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 5] + MIWaveTileA: 7 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 160 + MacroTileA: 224 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 140 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 7 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x160x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 5 + ThreadTileA: 28 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x128x128_MIdhuZzZrmslv6fIdikHYnhn1dpXNXnkfYYXLJoFZzt6s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 4] + MIWaveTileA: 7 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 128 + MacroTileA: 224 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 7 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 4 + ThreadTileA: 28 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x96x128_MI1b5RoEaxogDkty9pH0a4h4iqEK74VHsxBERA-H-NeUNY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 3] + MIWaveTileA: 7 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 96 + MacroTileA: 224 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 84 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 7 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 3 + ThreadTileA: 28 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x64x128_MI1g3lxPhwHBbhj-5_vMz-ezv_Gct2IZefQGw3UsfUuGuM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 7 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x32x128_MI1pRHgvlAR7N2hth7GOj12UHkPW1O97Tnl14UAnpJSv9c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 32 + MacroTileA: 224 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 7 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x256x128_MICt7le9q16ngZmPo_ZNYkoWOjOg2jVjonbKgLeTEzLxQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x224x128_MIFCEugnbAE1o_KNOxQTjmgk1-riRp7-bqr1XYLHQkDJI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x224x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 129024 + LdsInitCVgprs: false + LdsNumBytes: 129024 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 7] + MIWaveTileA: 6 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 224 + MacroTileA: 192 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 168 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 6 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x224x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 7 + ThreadTileA: 24 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x192x128_MIsXAzSnuik23-u_Tqj9IA9yIVlTTHvUnKVAh5tdPfE-0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 120832 + LdsInitCVgprs: false + LdsNumBytes: 120832 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x192x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x160x128_MIdawCsxOV8Q1yfZkDghTj5ROObRZEHHk5r8TLSJ8-4DI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 5] + MIWaveTileA: 6 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 160 + MacroTileA: 192 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 5 + ThreadTileA: 24 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x128x128_MI96PVzcjbGKsmGAPHOizb5QovZZPMnBXqYWXYWCMR6RE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x96x128_MI1XF9bGkyS2smPrKpZKRvXqZE_IlG80gFUg26F5mTZLis= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 96 + MacroTileA: 192 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x128_MI1cyGRGs5fNOjzNULtz2NEdDFcpvgD-53w5wck9Mz0rEA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36864 + LdsInitCVgprs: false + LdsNumBytes: 36864 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x128_MI1dXOVrMlvBwO6y7ESILkUB5p9GtB-Km9ykpVzg9YqhsY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 60416 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 60416 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 32 + MacroTileA: 192 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x256x128_MI_FMKfIaIlw04mnTidFqT-Fnj0gDbPBKMLBy9AxGoipc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124928 + LdsInitCVgprs: false + LdsNumBytes: 124928 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 8] + MIWaveTileA: 5 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 8 + ThreadTileA: 20 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x224x128_MIgvcBHckHBXx7rYEXvd58PMUeCwZSGsDJtMgKSLGOngQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x224x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 7] + MIWaveTileA: 5 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 224 + MacroTileA: 160 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 140 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 5 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x224x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 7 + ThreadTileA: 20 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x192x128_MIiC0cBFJQlkymU09JdEq7dUq1wvcSX9wghKOSQxkOErI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 6] + MIWaveTileA: 5 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 192 + MacroTileA: 160 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 5 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 6 + ThreadTileA: 20 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x160x128_MIwREMu9dXpHu0hNi1Te8Ryumj0HYpn1BFkusQoNfhAQ8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 5] + MIWaveTileA: 5 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 160 + MacroTileA: 160 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 100 + NumGlobalWriteVectorsPerThread: 100 + NumLoadsA: 5 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 5 + ThreadTileA: 20 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x128x128_MIcrglZoxFbHHjwhvMHFjwAoFdYaccIqWMvEsbqy0yalI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x96x128_MI1BUsuevlFp3qE9qBkCpl8ZHtyZp2nTMvZX1nz1cQRT7M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 3] + MIWaveTileA: 5 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 96 + MacroTileA: 160 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 5 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 3 + ThreadTileA: 20 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x128_MI1IVM_kq6fQOztyL87pw0KKbmpiglUkaPXmejIGutUrh4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x128_MI1bgH1ukcWIV-QEjZUrEMgbOOF-CIy8Yu_mpQb3L9Zq3g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30720 + LdsInitCVgprs: false + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 58368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 30720 + LdsOffsetMetadata_Blk: 58368 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 32 + MacroTileA: 160 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 5 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x256x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 + LdsInitCVgprs: false + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 64, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 64 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 64, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x256x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: 0 + enableLDSTrB: 0 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x224x128_MIHAxeqAl5GZyYpWyj9YBlHLiAot0B1CVx9Nf_5OHRC1Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x192x128_MIIlrnP4n1WnSi3fe0xoxepRfTuuLMMp-jgbnYdGnnvYE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x160x128_MI5bZ7yznefmkxpzzbB-XiEKlKRJu2AMeEgvXx60Tq5L0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x128_MI72iEPgzvyM2ZrWa9mJE7BjRqJtpIrgM81I_tGNEjK0c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x128_MI1xf_ukgQ-Z-qNS4jmkabhCAr3mB7WAQFBVo0jxB_s6wM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x128_MI1j8uVA7i_pGx8ZCBrMR8ZhK3U6bp7J7YjtbYnnKdPxDI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x128_MI1M6uR2wC2h6sr2Hl9YrF8zhWGQ7RgoVsQ-TM7lCE6pPI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 22528 + LdsInitCVgprs: false + LdsNumBytes: 22528 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 22528 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x256x128_MI1lw4esJJcKXjrN7F_gKTEASvr4FPioQKVPOZImphrvBY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49152 + LdsInitCVgprs: false + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 8] + MIWaveTileA: 3 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 8 + ThreadTileA: 12 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x224x128_MI1JixbiZckG_HlXVPjjWZHNInPUwLf2xyuU7BTFa5yHI0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 7] + MIWaveTileA: 3 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 224 + MacroTileA: 96 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 84 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 3 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 7 + ThreadTileA: 12 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x192x128_MI1wIqbeUYowBvH47eY0D90voHkVaoBg2lPdbX2d68nIJ4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 6] + MIWaveTileA: 3 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 3 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 6 + ThreadTileA: 12 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x160x128_MI1J-2M-3q4LSCntZjUh3v8nZH3qUcDC89rCJ0Fdq3iMeI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 80896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 80896 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 160 + MacroTileA: 96 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 3 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x128_MI1Vr1CG6o8iRf8BbhZJxSu0FVBXoX31LtlPuM8cs_YEXU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x128_MI16Cnahg_B3KmoGxZkTO1OYZLBTHOrwHNY1VQd2t5IrylI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30720 + LdsInitCVgprs: false + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 30720 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x128_MI16W4yMmcRT_Rcp3ZNo2gSW3q4SJdd9i9DRRKcgqox59VE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24576 + LdsInitCVgprs: false + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x128_MI16zNms4_iu6xgFQiq42WUbXrNnFNyslp04sbAKWAlTbog= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 20480 + LdsInitCVgprs: false + LdsNumBytes: 20480 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x256x128_MI11aFcbk23grO4-hK8aBkA0Q2TJ_RzSnSefhCsshDFWE8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x224x128_MI1BIQ9uHrC9E8FHJ9mzReo0log_q-mFe47Gvfw9lRTxKU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 224 + MacroTileA: 64 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 2 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 7 + ThreadTileA: 8 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x128_MI1oWIDEj_bU2sIdW6WZSqKJI8Okx_mH0MwWvgKg4p0Wh0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36864 + LdsInitCVgprs: false + LdsNumBytes: 36864 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x128_MI1DfS6ucItLGFYy3q0Zz-JZd7icG_ycHgazunawsT8Pns= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x128_MI1jK-MUAAqzxLFJYDxWtp_lRnOH6G_Sa3LS7gsDkzJ5ZI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x128_MI16qBSTIiXAL1b2k0UyETAi0pu4GSfWwKXX5o-ybzBtncE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24576 + LdsInitCVgprs: false + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x128_MI16mOGETtdY_W1srxs4Mfgd04W-cV79aCR6raMD3HA6jWU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 18432 + LdsInitCVgprs: false + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x128_MI16wIclDOa00a_BcP3sOUDpDc_qFY9RvbDE777DV1a9O7k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30720 + LdsInitCVgprs: false + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x128_MI1pg1aJvOuqnmTxe_-2e9_1S4uFGoAaPhgR-zs28ui024= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38912 + LdsInitCVgprs: false + LdsNumBytes: 38912 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 70656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 38912 + LdsOffsetMetadata_Blk: 70656 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 8] + MIWaveTileA: 1 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x224x128_MI1bJJ5iziycwP81Vg_rFbnRobq-WE2Gw-XuD7kvGV58LU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 70656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 70656 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 7] + MIWaveTileA: 1 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 224 + MacroTileA: 32 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 1 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x224x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 7 + ThreadTileA: 4 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x128_MI1c60kFRwy-kwCbpVB3Be-Tp-X0igfgGU9wMtdCOAL5xM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 6] + MIWaveTileA: 1 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 192 + MacroTileA: 32 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 1 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x128_MI13_eugo-sWMKarMTBe1WI4hmzJdiGvToykcHBbj4WXow= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30720 + LdsInitCVgprs: false + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 30720 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 160 + MacroTileA: 32 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 1 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x128_MI1gSIhM4a60TQQmpGca3zPH_1C9YMOrBSb2Bu9JrxvD-k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 22528 + LdsInitCVgprs: false + LdsNumBytes: 22528 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 22528 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x128_MI16YH7g6N9l5w5imlP3Lp9mdDXwtO_ZtzNO9yUXMFVOjKQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 20480 + LdsInitCVgprs: false + LdsNumBytes: 20480 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 1 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x128_MI16IFZMDGDs3-q6koIU2xIB_XUAfzhmng9Mg_46GO3Rqf4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30720 + LdsInitCVgprs: false + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x128_MI16rLKaVqye5ZVlglTUO9ioHrZ6WB-GJ_Qs2RqP4S8ZIXE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x240x128_MIyoGz0G9DWmdNz6WjS2Bq6rCsCrR_mXM4td8eMkMYEDw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x240x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73216 + LdsInitCVgprs: false + LdsNumBytes: 73216 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 38400 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 165888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73216 + LdsOffsetMetadata_Blk: 165888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 15] + MIWaveTileA: 4 + MIWaveTileB: 15 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 240 + MacroTileA: 256 + MacroTileB: 240 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 8 + NumLoadsB: 15 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 15 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x240x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 15 + ThreadTileA: 16 + ThreadTileB: 15 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x208x128_MIBnc7kWhy6AvjwbNc5adjziCfJiLAVAD1RzoNcS6OMZQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x208x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 68096 + LdsInitCVgprs: false + LdsNumBytes: 68096 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 165888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 68096 + LdsOffsetMetadata_Blk: 165888 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 13] + MIWaveTileA: 4 + MIWaveTileB: 13 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 208 + MacroTileA: 256 + MacroTileB: 208 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 208 + NumGlobalWriteVectorsPerThread: 52 + NumLoadsA: 8 + NumLoadsB: 13 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 13 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x208x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 13 + ThreadTileA: 16 + ThreadTileB: 13 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x176x128_MIF5A-1D1ZIPjJsJJ7ttt4AlfEFPQG0riur7Zv_qXG5YI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x176x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128512 + LdsInitCVgprs: false + LdsNumBytes: 128512 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 28160 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 11] + MIWaveTileA: 4 + MIWaveTileB: 11 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 176 + MacroTileA: 256 + MacroTileB: 176 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 176 + NumGlobalWriteVectorsPerThread: 44 + NumLoadsA: 8 + NumLoadsB: 11 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 11 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x176x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 11 + ThreadTileA: 16 + ThreadTileB: 11 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x144x128_MIyRCs1CYgk6qoLPVzZz2698DWR4wMJ7whNVJ8JwJfnZc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x144x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 144 + MacroTileA: 256 + MacroTileB: 144 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x144x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 9 + ThreadTileA: 16 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x112x128_MIaVDafPaNDXaLtu56b1YkZsQUI1aLmxb3jgdu_dv65EA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x112x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 52736 + LdsInitCVgprs: false + LdsNumBytes: 52736 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 17920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 112 + MacroTileA: 256 + MacroTileB: 112 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x112x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x80x128_MI1jNVYmoC6fjSjbIADTf0f9l0Cs4TKifqjd6Xql0ehllk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 47616 + LdsInitCVgprs: false + LdsNumBytes: 47616 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 12800 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 47616 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 80 + MacroTileA: 256 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x128_MI1eX_SljyDEcS5rCPMhk2caJwb3hhe7qqzmb_HRJ0i-U0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42496 + LdsInitCVgprs: false + LdsNumBytes: 42496 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 7680 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42496 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 48 + MacroTileA: 256 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x128_MI1fPZ5I9MI2WZKscvBb8L9C1UGdH6oze92UNxsd_rmjZ0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37376 + LdsInitCVgprs: false + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT240x256x128_MI17whl_3k4Y1EhiqZzn7y-o-UTqlat-Sd3AsiBr6V3nc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT240x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT15_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73216 + LdsInitCVgprs: false + LdsNumBytes: 73216 + LdsNumElementsAlignedA: 38400 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 38400 + LdsOffsetB_Blk: 169472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73216 + LdsOffsetMetadata_Blk: 169472 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [15, 4] + MIWaveTileA: 15 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 240 + MacroTile1: 256 + MacroTileA: 240 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 240 + NumLoadsA: 15 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 15 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT240x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT15_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 60 + ThreadTile1: 4 + ThreadTileA: 60 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT208x256x128_MIjK835T0h-3Ar1N2b4kmmt65KtYLTJhfaZH3LmR2auFA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT208x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 68096 + LdsInitCVgprs: false + LdsNumBytes: 68096 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 68096 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [13, 4] + MIWaveTileA: 13 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 208 + MacroTile1: 256 + MacroTileA: 208 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 208 + NumGlobalWriteVectorsPerThread: 208 + NumLoadsA: 13 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 13 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT208x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 52 + ThreadTile1: 4 + ThreadTileA: 52 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT176x256x128_MIgsJynaoGp5Lr3OC9AUIiNlgAW9kkq7qPUmgK4ohOA_k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT176x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128512 + LdsInitCVgprs: false + LdsNumBytes: 128512 + LdsNumElementsAlignedA: 28160 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 28160 + LdsOffsetB_Blk: 93696 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 28160 + LdsOffsetMetadata_Blk: 93696 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [11, 4] + MIWaveTileA: 11 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 176 + MacroTile1: 256 + MacroTileA: 176 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 176 + NumGlobalWriteVectorsPerThread: 176 + NumLoadsA: 11 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 11 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT176x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 44 + ThreadTile1: 4 + ThreadTileA: 44 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT144x256x128_MI6ZUiH-qsm4LTRuwyb7uw6oTXUaPI3Vjpt7WHaUbBOfI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT144x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [9, 4] + MIWaveTileA: 9 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 144 + MacroTile1: 256 + MacroTileA: 144 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT144x256x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 36 + ThreadTile1: 4 + ThreadTileA: 36 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT112x256x128_MIiu6Dap7c-mOezayg2ePWEWJWIUNWwq_GUAWuc6asW8k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT112x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 52736 + LdsInitCVgprs: false + LdsNumBytes: 52736 + LdsNumElementsAlignedA: 17920 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 83456 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 52736 + LdsOffsetMetadata_Blk: 83456 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [7, 4] + MIWaveTileA: 7 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 112 + MacroTile1: 256 + MacroTileA: 112 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 7 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT112x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 4 + ThreadTileA: 28 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT80x256x128_MI1fyJI8fnAEv3sOPHcVPqOC2AU9kSqOftCPxastr6HZJE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT80x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 47616 + LdsInitCVgprs: false + LdsNumBytes: 47616 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 78336 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 47616 + LdsOffsetMetadata_Blk: 78336 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT80x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x128_MI1-Zo3r6x3T98ZMWTZ4H7LqLtk7xXmH_6zOjhPN09FdTk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42496 + LdsInitCVgprs: false + LdsNumBytes: 42496 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 73216 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 42496 + LdsOffsetMetadata_Blk: 73216 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 256 + MacroTileA: 48 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x128_MI1wehOWUghk2A68IMMHO37hmuhc-kFrA6X9yfb9WRqhBg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37376 + LdsInitCVgprs: false + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 68096 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 68096 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI1vCzuwEprjPexlyglCW3zRXsjX9xUmGeZzokhH60Z9yE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 75776 + LdsInitCVgprs: false + LdsNumBytes: 75776 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 75776 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 1] + MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x64x256_MI1SiLGOArtDjYGdvuIvIxglRBR2PASF7znHipyq968J9k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 81920 + LdsInitCVgprs: false + LdsNumBytes: 81920 + LdsNumElementsAlignedA: 64512 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 64512 + LdsOffsetB_Blk: 195584 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 81920 + LdsOffsetMetadata_Blk: 195584 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x32x256_MI1Atkrqghs5hZ7rFrRDMn1qRHk9P7RjRIX2jTHJaCZcIs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 64512 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 64512 + LdsOffsetB_Blk: 195584 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 195584 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 32 + MacroTileA: 224 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT224x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x96x256_MI1H6kBMzYhDsiXY1vBEU2Ud8UhLYehCYuWeQaCulrUTHo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 79872 + LdsInitCVgprs: false + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 183296 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 183296 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 96 + MacroTileA: 192 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI153vNbJPjGrVniCPDOzS5UGzSK6NOLhTT3dYrLoo1YJ4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 183296 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 183296 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x256_MI1y0BaulKQmTtKr2XSp5P3dnaOK0k5UYUxfkhT1HxC_rU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 117760 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 117760 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 1] + MIWaveTileA: 6 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 32 + MacroTileA: 192 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 12 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT192x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 1 + ThreadTileA: 24 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x128x256_MIUYoy0JGLYkrhIavz9gjog-Elk__Z0jYq-QXQEcN2PaA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 79872 + LdsInitCVgprs: false + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 177152 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x96x256_MI1cUVPZBoZ-mtmE8sTWF_Az_FU-gB_6I8zhwfiN1pEaks= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 177152 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 3] + MIWaveTileA: 5 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 96 + MacroTileA: 160 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 10 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 3 + ThreadTileA: 20 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x256_MI1EM3GcfaRjwnLLnXwjtnf8XZHB699buVut-TXk4ZjWyc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63488 + LdsInitCVgprs: false + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 111616 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 111616 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x256_MI15N7cdBjXALKtpaYkT17ouFsMqBdqYm1rqUc2cC7GeLE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 + LdsInitCVgprs: false + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 111616 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 111616 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 32 + MacroTileA: 160 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 10 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT160x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x160x256_MIU14cbwZ_t9NC-mBYr7N5MljJmeoGwBb-WUmjyy7ILhU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 79872 + LdsInitCVgprs: false + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x256_MIM08mmLtqT14x3TxGefabTcO290EciaeH6GnjDeU8Bgo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI1OG3k0SIeaxxBYmDDXCQJ0v824MXDB2cvmAH14Io8mlM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI1Ee_Jq2fBrVEHXaPp-qikzlaFXpjSm2wKDPvD-MKxLM0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI13IeiqjCNSTMel5JBLjnHPiB9PE_JYWIIfXqePFminMc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x192x256_MI1qUn-crPudL6dXZ6rCcKnkE9s4w-RcuNNqOhYdbIrR28= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 79872 + LdsInitCVgprs: false + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 158720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 158720 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 6] + MIWaveTileA: 3 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 6 + ThreadTileA: 12 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x160x256_MI18wp9zZc9oencMlWPaDY7BUc_iQgMxwF-RQOzHet7J3I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 158720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 158720 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 160 + MacroTileA: 96 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI1LrtbILHycv46qRt6NxJYDFm4zT_v-o9V1TAKrHNeAeQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x256_MI164Fx4NhdfXwLWoN6X-Afen5a5uy0a8LeKZgpjvani3CM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 + LdsInitCVgprs: false + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x256_MI16_Nc2lY4fwUC-cbMivJ1U_eNf_9vhMW0WYu6jNMKBzYI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x256_MI16X2E4Wx9NVLCyWp4uy3bGT7QFxR20eVc2o2HQJqwi8Hk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36864 + LdsInitCVgprs: false + LdsNumBytes: 36864 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x224x256_MI1ehl3l0UKCFUW0CqlYWmD5Gv6DpebCzV_7KPnbR_5y4Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x224x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 81920 + LdsInitCVgprs: false + LdsNumBytes: 81920 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 64512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 81920 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 224 + MacroTileA: 64 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x224x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 7 + ThreadTileA: 8 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI1PpwSxkO--tNKZnLh5u1Zqztz5GRs2qwDUtmWoQ32_0M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x256_MI1BIcxM2OYKG0Jqn5GRpKyHvCNhj2aQceoHMvTm7lJo6Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63488 + LdsInitCVgprs: false + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI1-eL7HedP-uoMH-4uLF-vatbGDwtmp3GZhw8IuLzDHvE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x256_MI16Tidk1Ou4UA72T_vnHnWAsymx18z-7WG5S7yV6EMzxAE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 + LdsInitCVgprs: false + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16P7nrnDgCqsPXJVFvUTHGSoy-i5STapWZDg4F8KHN2a8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x256_MI164FNCErQfeGGweRYwOlRLTlQQw-uihsMini7VG5MjqUs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI1Weg33rA_lR82ctIL5Wv-agG74LvDUGtagD8ZNXCm6m8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 75776 + LdsInitCVgprs: false + LdsNumBytes: 75776 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 66560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 75776 + LdsOffsetMetadata_Blk: 140288 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 8] + MIWaveTileA: 1 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x224x256_MI1yjWOfHPT7EqX9rhhXejY8_eiJ07brJZxCVBlCXR9DBo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x224x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 64512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 140288 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 7] + MIWaveTileA: 1 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 224 + MacroTileA: 32 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 2 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 14 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x224x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 7 + ThreadTileA: 4 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x256_MI1pVP6pjfyPWPVeerMg8FY1EyiGjH4B_j7tc4lRUzRPvg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 + LdsInitCVgprs: false + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 6] + MIWaveTileA: 1 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 192 + MacroTileA: 32 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x192x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x256_MI1q7mTED5UvVJrOPDvYFNPflDjUkX1okY5IJT3AZBddMM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 + LdsInitCVgprs: false + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 160 + MacroTileA: 32 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x160x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI1fFpYWYjPWsmrBYgoz9CIhz7OHVoysPvt9SgTel6GPWI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x256_MI16mu07e3PYZ_jp2LQxX0lmvu4pmsw_265fxGpIWycVSjg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 36864 + LdsInitCVgprs: false + LdsNumBytes: 36864 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 36864 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x256_MI162x9lNzvJGF2jhOPNNRp1Q3mFaUkUkcNWlM7ecLmXXfs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x256_MI16-01AyvDsXfpTy-wvoK3K25-ZRKuFeCLIRsj5lVXzX_s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 18432 + LdsInitCVgprs: false + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x80x256_MI1LDs52gy7NI4WuOMQ_idbNXBwBK86orsq1YVyOo4IvKs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x80x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 90624 + LdsInitCVgprs: false + LdsNumBytes: 90624 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 90624 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 80 + MacroTileA: 256 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 16 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x80x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x256_MI1NztuuZzeaShCMYTqz64dRhvSvIEetpxjBMHS12EujLs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 81408 + LdsInitCVgprs: false + LdsNumBytes: 81408 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 81408 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 48 + MacroTileA: 256 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x48x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x256_MI1-eS3GREuZbaF_WcdTrJ8tJN1VRBcjitHDgD1jEhWCcE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 72192 + LdsInitCVgprs: false + LdsNumBytes: 72192 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 72192 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT256x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT80x256x256_MI1ZLUtUYYiOHV_3Bx1CweJ56xkOj29nbS2TUKPiIsgWeM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT80x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 90624 + LdsInitCVgprs: false + LdsNumBytes: 90624 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 90624 + LdsOffsetMetadata_Blk: 154112 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT80x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x256_MI1QlOH2GDodFrn6A0rxCKHrFBaU_QWfv580nm9xWdZY0k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 81408 + LdsInitCVgprs: false + LdsNumBytes: 81408 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 144896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 81408 + LdsOffsetMetadata_Blk: 144896 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 256 + MacroTileA: 48 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT48x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x256_MI1PZ1TyY_JO23lBPKX9VhACTOX37NNgYiBjf2A-tanW7w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 72192 + LdsInitCVgprs: false + LdsNumBytes: 72192 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 135680 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 72192 + LdsOffsetMetadata_Blk: 135680 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 256 + MacroTileA: 16 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x256x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x512_MI16Z6LYIKVcBKQJUXMfCTkH2UmyhUWm90vrVgFH5F5h_is= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 183296 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 183296 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x512_MI165IfRyrGVaZ277jt_y40d11CoSr5lg3fA8j83jyEgno8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 52224 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 52224 + LdsOffsetB_Blk: 183296 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 183296 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 32 + MacroTileA: 96 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 12 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT96x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x512_MI16Gx0c38MGhJUAjADEf6sgUUCYEnSxT0HvEevn-TLzDBA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x512_MI16F-32foTbZU7_IZ8fkLWy78MMRaTz5VOyQ3YshyDK2WM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x512_MI16-AdnHWdGaF-mVZCwCYUd2wq3b2fSNewc3-7rmRp2b3Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x512_MI16pBqfsRIAkQBDjOQRcdakPvu1rZh51vJuBh1AVswNR5Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x512_MI16tO84-u8CmDozhzOYFPjZx1aAnLl_RdoGaIlGdosGUzM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x512_MI16WaJJEhK9J396-Hl1mrtJrwp0VdJSLCW5ohgvQGJ2Muc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 69632 + LdsInitCVgprs: false + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x96x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x512_MI16BfEq8Qvc7y_UvNL_Jl4pNSs8RolDE0lSBE0kSa4q37k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x512_MI16802-xq_nYZ8mMbx6wb5jMH9gLrNlJOzl_bKrKs1SBqY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 + LdsInitCVgprs: false + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16nxXVLq-_LhX2N8-ggvlvGFb2CSQPdL0Jk31-3Yn8K8w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16S71Chi2csqavB37sGyuzVl_JPK7OXCWtUkcefaNr2-I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 512 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x1024_MI1Uh3YqlpEBhxKyjcdbTJSpHtEarJLrJfEIa0Mnxwsb5k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 1024 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x1024_MI19i-it_oJJnIjaw-zFrcLqZWij-txKmEuZwwuPirD8iw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 1024 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 2 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR2_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x512_MI16t0-mPljiEZtupRhWAFpnxBDrJen-FsvDD34Q7PN3s_4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17408 + LdsInitCVgprs: false + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x512_MI16CAJPr2P6olRKJqA2Fmiymyoh6JhX7nYYGS7ffsGJvmw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x512_MI16K_Na50sAdM91rhXevAlyiM9lrmobRF_PSYV6N76rQzk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x512_MI16dVDH5F1pzH8NV-uymOAiVxprnNui5yf2Mp5t1_LqLTs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x512_MI16Uv-d7icfFxrZ-YUiRaTBsX3xXbc3HcmSMlEdtwh8Yb0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x1024_MI1TNTLRLqQ0uhz1zq5XPt3rxttGE1MT5QoFsMWFmpwZGg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x1024_MI13HHr0kFgQAqJ2TjtZ8tTohe6Udz-5u0CwkMq-_9renQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT32x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x1024_MI1KAql7WR7THsIAKFHzmlcHQVpFofe0Vzt2xFE0wiTOfU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84480 + LdsInitCVgprs: false + LdsNumBytes: 84480 + LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 67584 + LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84480 + LdsOffsetMetadata_Blk: 198656 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT64x16x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x1024_MI1-LGVgPAQHxTiHAOxIZA3A-LLqgvwFhsRG4LoerQvYqg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50688 + LdsInitCVgprs: false + LdsNumBytes: 50688 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x32x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x1024_MI1uwD3q8H7ROxJTYYpP8cvtJiz3DeoGvbO5IiJaZpCFnE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: true + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 1024 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 16 + GlobalReadVectorWidthB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 1024 + LSCB: 1024 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 84480 + LdsInitCVgprs: false + LdsNumBytes: 84480 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 147968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 84480 + LdsOffsetMetadata_Blk: 147968 + LdsPadA: 32 + LdsPadB: 32 + LdsPadMetadata: 0 + LocalReadVectorWidth: 16 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ProblemType: + Activation: 1 + ActivationComputeDataType: 0 + ActivationFuncCall: 1 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: 1 + BiasDataTypeList: [0, 4, 7] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 15 + DataTypeA: 15 + DataTypeAmaxD: 0 + DataTypeB: 15 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: Scalar + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Alik_Bljk_F8SS_BH_BiasSHB_HAS_SAB_SAV_UserArgs_MT16x64x1024_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA16_GRVWB16_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA32_LPB32_LPM0_LRVW16_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 1024 + _DepthUA: 1024 + _DepthUB: 1024 + _DepthUMetadata: 1024 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- null +- null +- null +- null +- DeviceEfficiency +- Prediction diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/Signature.py b/projects/hipblaslt/tensilelite/Tensile/Components/Signature.py index d02bff803072..2f1bd9cfd02d 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/Signature.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/Signature.py @@ -207,21 +207,14 @@ def __call__(self, writer) -> SignatureBase: if kernel["StreamK"]: # StreamK args - signature.addArg("MagicNumberProblemNumGroupTiles0", SVK.SIG_VALUE, "u32") - signature.addArg("MagicShiftProblemNumGroupTiles0", SVK.SIG_VALUE, "u32") signature.addArg("ItersPerTile", SVK.SIG_VALUE, "u32") - signature.addArg("MagicNumberItersPerTile", SVK.SIG_VALUE, "u32") - signature.addArg("MagicShiftItersPerTile", SVK.SIG_VALUE, "u32") - signature.addArg("MagicNumProblemNumGroupTiles0By1", SVK.SIG_VALUE, "u32") - signature.addArg("MagicShiftProblemNumGroupTiles0By1", SVK.SIG_VALUE, "u32") signature.addArg("TotalIters", SVK.SIG_VALUE, "u32") signature.addArg("SKItersPerWG", SVK.SIG_VALUE, "u32") - userArgumentsInfo.gemmArgumentSize += 36 + userArgumentsInfo.gemmArgumentSize += 12 if kernel["StreamK"] >= 2: # Two-tile SK - signature.addArg("skGrid", SVK.SIG_VALUE, "u32") - signature.addArg("skTiles", SVK.SIG_VALUE, "u32") + signature.addArg("skGridAndTiles", SVK.SIG_VALUE, "u32") signature.addArg("skExtraIters", SVK.SIG_VALUE, "u32") - userArgumentsInfo.gemmArgumentSize += 12 + userArgumentsInfo.gemmArgumentSize += 8 # "dpTilesPerWG" if kernel["ProblemType"]["UseScaleAB"]: diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py index 231968c11f9d..88e68797b40b 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py @@ -29,7 +29,9 @@ SMinU32, SMovB32, SMovB64, SMulI32, SNop, SSleep, SStoreB32, SSubU32, \ SWaitCnt, VAddF32, VAddF64, VAddPKF16, VAddU32, VLShiftRightB32, VMovB32, \ VReadfirstlaneB32, VCvtBF16toFP32 -from rocisa.functions import scalarStaticDivideAndRemainder, sMagicDiv2, vectorStaticMultiply, BranchIfNotZero +from rocisa.functions import scalarStaticDivideAndRemainder, sMagicDiv2, \ + vectorStaticMultiply, BranchIfNotZero, scalarUInt32DivideAndRemainder + from ..Common import print2, ceilDivide, log2 from ..Component import Component @@ -61,6 +63,7 @@ def __call__(self, writer, kernel): module = Module("XCCMapping On") with writer.allocTmpSgpr(4) as tmpSgprRes: + skGrid = tmpSgprRes.idx sXCC = tmpSgprRes.idx sGridC = tmpSgprRes.idx + 1 sGridF = tmpSgprRes.idx + 2 @@ -74,11 +77,12 @@ def __call__(self, writer, kernel): sTmpRes = ContinuousRegister(idx=sTmp, size=2) # sGridC = ceil(grid / xccm) - module.add(SAddU32(dst=sgpr(sGridC), src0=sgpr("skGrid"), src1=hex(kernel["StreamKXCCMapping"] - 1), comment="ceil(grid/xccm)")) + module.add(SLShiftRightB32(dst=sgpr(skGrid), shiftHex=hex(16), src=sgpr("skGridAndTiles"), comment="Get skGrid")) + module.add(SAddU32(dst=sgpr(sGridC), src0=sgpr(skGrid), src1=hex(kernel["StreamKXCCMapping"] - 1), comment="ceil(grid/xccm)")) module.add(scalarStaticDivideAndRemainder(qReg=sGridC, rReg=-1, dReg=sGridC, divisor=kernel["StreamKXCCMapping"], tmpSgprRes=sTmpRes, doRemainder=0)) # sGridF = floor(grid / xccm) # sGridM = grid % xccm - module.add(scalarStaticDivideAndRemainder(qReg=sGridF, rReg=sGridM, dReg="skGrid", divisor=kernel["StreamKXCCMapping"], tmpSgprRes=sTmpRes)) + module.add(scalarStaticDivideAndRemainder(qReg=sGridF, rReg=sGridM, dReg=skGrid, divisor=kernel["StreamKXCCMapping"], tmpSgprRes=sTmpRes)) # sXCC = wg0 % xccm # sqtmp is temp register for quotient for non-power-of-2 case # sqtmp overlaps temp registers, works in this case and output is discarded @@ -126,7 +130,11 @@ def skTileIndex(self, writer, kernel, sTmp, tPA, tPB): module.addComment0("StreamK calculate tile idx and map to WG") # sTmp = tile index - module.add(sMagicDiv2(sgpr(sTmp), sgpr(sTmp+1), sgpr("StreamKIter"), sgpr("MagicNumberItersPerTile"), sgpr("MagicShiftItersPerTile"), sgpr(sTmp+2))) + tmpVgpr = writer.vgprPool.checkOut(2, "div") + tmpVgprRes = ContinuousRegister(idx=tmpVgpr, size=2) + module.add(scalarUInt32DivideAndRemainder(qReg=sTmp, dReg="StreamKIter", divReg="ItersPerTile", rReg=-1, tmpVgprRes=tmpVgprRes, wavewidth=kernel["WavefrontSize"], doRemainder=False, comment="StreamKIter // ItersPerTile")) + tmpVgprRes = None + writer.vgprPool.checkIn(tmpVgpr) # sTmp+1 = tile start module.add(SMulI32(dst=sgpr(sTmp+1), src0=sgpr(sTmp), src1=sgpr("ItersPerTile"), comment="Tile start iteration")) # sTmp+2 = tile end @@ -140,19 +148,18 @@ def skTileIndex(self, writer, kernel, sTmp, tPA, tPB): return module def skIndexToWG(self, writer, kernel, sTmp): + # Note: There's one unused sgpr passed with sTmp. module = Module("StreamK skIndexToWG") # Map StreamK tile index to wg0/1 module.addComment0("Map StreamK tile index to wg0/1/2") - module.add(sMagicDiv2(sgpr(sTmp+1), sgpr(sTmp+2), sgpr(sTmp), sgpr("MagicNumProblemNumGroupTiles0By1"), sgpr("MagicShiftProblemNumGroupTiles0By1"), sgpr(sTmp+3))) - module.add(SMovB32(dst=sgpr("WorkGroup2"), src=sgpr(sTmp+1), comment="wg2 = Tile Idx / problemNumGroupTiles0By1")) - module.add(SMulI32(dst=sgpr(sTmp+1), src0=sgpr(sTmp+1), src1=sgpr("NumWorkGroups0"), comment="remainder part 1 : quotient * divisor")) - module.add(SMulI32(dst=sgpr(sTmp+1), src0=sgpr(sTmp+1), src1=sgpr("NumWorkGroups1"), comment="remainder part 1 : quotient * divisor")) - module.add(SSubU32(dst=sgpr(sTmp), src0=sgpr(sTmp), src1=sgpr(sTmp+1), comment="remainder")) - module.add(sMagicDiv2(sgpr(sTmp+1), sgpr(sTmp+2), sgpr(sTmp), sgpr("MagicNumberProblemNumGroupTiles0"), sgpr("MagicShiftProblemNumGroupTiles0"), sgpr(sTmp+3))) - module.add(SMovB32(dst=sgpr("WorkGroup1"), src=sgpr(sTmp+1), comment="wg1 = Tile Idx / problemNumGroupTiles0")) - module.add(SMulI32(dst=sgpr("WorkGroup0"), src0=sgpr(sTmp+1), src1=sgpr("NumWorkGroups0"), comment="remainder part 1 : quotient * divisor")) - module.add(SSubU32(dst=sgpr("WorkGroup0"), src0=sgpr(sTmp), src1=sgpr("WorkGroup0"), comment="wg0 = Tile Idx % problemNumGroupTiles0")) + module.add(SMulI32(dst=sgpr(sTmp+1), src0=sgpr("NumWorkGroups0"), src1=sgpr("NumWorkGroups1"), comment="Total tiles")) + tmpVgpr = writer.vgprPool.checkOut(2, "div") + tmpVgprRes = ContinuousRegister(idx=tmpVgpr, size=2) + module.add(scalarUInt32DivideAndRemainder(qReg="WorkGroup2", dReg=sTmp, divReg=sTmp+1, rReg=sTmp+2, tmpVgprRes=tmpVgprRes, wavewidth=kernel["WavefrontSize"], doRemainder=True, comment="TileID // nWG0*nWG1")) + module.add(scalarUInt32DivideAndRemainder(qReg="WorkGroup1", dReg=sTmp+2, divReg="NumWorkGroups0", rReg="WorkGroup0", tmpVgprRes=tmpVgprRes, wavewidth=kernel["WavefrontSize"], doRemainder=True, comment="TileID // nWG0")) + tmpVgprRes = None + writer.vgprPool.checkIn(tmpVgpr) module.addSpaceLine() return module @@ -299,9 +306,11 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele module.add(SAddU32(dst=sgpr(sCtaIdx), src0=sgpr("StreamKIdx"), src1=1, comment="input partial tile index")) sFixupEnd = writer.sgprPool.checkOut(1, "FixupEnd", preventOverflow=False) # self.defineSgpr("CtaEnd", 1) - module.add(sMagicDiv2(sgpr(tmpSgpr), sgpr(tmpSgpr+1), sgpr("StreamKIterEnd"), sgpr("MagicNumberItersPerTile"), sgpr("MagicShiftItersPerTile"), sgpr(tmpSgpr+2))) - module.add(SMulI32(dst=sgpr(tmpSgpr), src0=sgpr(tmpSgpr), src1=sgpr("ItersPerTile"), comment="start iteration of partial tile")) - module.add(SSubU32(dst=sgpr(sFixupEnd), src0=sgpr("StreamKIterEnd"), src1=sgpr(tmpSgpr), comment="calc iterations completed by this WG")) + tmpVgpr = writer.vgprPool.checkOut(2, "div") + tmpVgprRes = ContinuousRegister(idx=tmpVgpr, size=2) + module.add(scalarUInt32DivideAndRemainder(qReg=tmpSgpr, dReg="StreamKIterEnd", divReg="ItersPerTile", rReg=sFixupEnd, tmpVgprRes=tmpVgprRes, wavewidth=kernel["WavefrontSize"], doRemainder=True, comment="StreamKIterEnd // ItersPerTile")) + tmpVgprRes = None + writer.vgprPool.checkIn(tmpVgpr) module.add(skFixupLabel) @@ -1662,7 +1671,8 @@ def preLoop(self, writer, kernel): # clamp to end of sk iterations # TODO maybe remove clamp, since extra iters code should guarantee total iterations match sTmp = writer.sgprPool.checkOut(1, "TotalSKIters", preventOverflow=False) - module.add(SMulI32(dst=sgpr(sTmp), src0=sgpr("skTiles"), src1=sgpr("ItersPerTile"), comment="Total SK iters")) + module.add(SAndB32(dst=sgpr(sTmp), src0=sgpr("skGridAndTiles"), src1=hex(65535), comment="Get skTiles")) + module.add(SMulI32(dst=sgpr(sTmp), src0=sgpr(sTmp), src1=sgpr("ItersPerTile"), comment="Total SK iters")) module.add(SMinU32(dst=sgpr("StreamKIterEnd"), src0=sgpr("StreamKIterEnd"), src1=sgpr(sTmp), comment="Cap ending iter at total SK iters")) writer.sgprPool.checkIn(sTmp) # check if this WG has no work to do @@ -1689,11 +1699,13 @@ def graWorkGroup(self, writer, kernel, tPA, tPB): # Increment StreamK iteration # If moving from SK to DP, next iteration is first DP # sTmp = offset to first DP tile - module.add(SMulI32(dst=sgpr(sTmp+3), src0=sgpr("skTiles"), src1=sgpr("ItersPerTile"), comment="Offset to first DP tile")) + module.add(SAndB32(dst=sgpr(sTmp+3), src0=sgpr("skGridAndTiles"), src1=hex(65535), comment="Get skTiles")) + module.add(SMulI32(dst=sgpr(sTmp+3), src0=sgpr(sTmp+3), src1=sgpr("ItersPerTile"), comment="Offset to first DP tile")) module.add(SMulI32(dst=sgpr(sTmp+1), src0=sgpr("StreamKIdx"), src1=sgpr("ItersPerTile"), comment="WG tile offset")) module.add(SAddU32(dst=sgpr(sTmp+3), src0=sgpr(sTmp+3), src1=sgpr(sTmp+1), comment="DP start offset + WG offset")) # If already in DP, add dpShift - module.add(SMulI32(dst=sgpr(sTmp+1), src0=sgpr("skGrid"), src1=sgpr("ItersPerTile"), comment="DP iterations shift")) + module.add(SLShiftRightB32(dst=sgpr(sTmp+1), shiftHex=hex(16), src=sgpr("skGridAndTiles"), comment="Get skGrid")) + module.add(SMulI32(dst=sgpr(sTmp+1), src0=sgpr(sTmp+1), src1=sgpr("ItersPerTile"), comment="DP iterations shift")) module.add(SAddU32(dst=sgpr(sTmp+1), src0=sgpr(sTmp+1), src1=sgpr("StreamKIter"), comment="Add DP shift")) # Save DP iter in sTmp module.add(SCmpLtU32(src0=sgpr("StreamKIter"), src1=sgpr("StreamKIterEnd"), comment="Check if in SK or DP section")) @@ -1760,7 +1772,8 @@ def preLoop(self, writer, kernel): module.add(SMulI32(dst=sgpr("StreamKIter"), src0=sgpr("StreamKIdx"), src1=sgpr("ItersPerTile"), comment="DP starting iteration (case: DP work to do)")) module.add(SMovB32(dst=sgpr("StreamKIterEnd"), src=sgpr("TotalIters"), comment="DP ending iteration (case: only DP work to do)")) sTmp = writer.sgprPool.checkOut(1, "TotalSKIters", preventOverflow=False) - module.add(SMulI32(dst=sgpr(sTmp), src0=sgpr("skTiles"), src1=sgpr("ItersPerTile"), comment="Total SK iters")) + module.add(SAndB32(dst=sgpr(sTmp), src0=sgpr("skGridAndTiles"), src1=hex(65535), comment="Get skTiles")) + module.add(SMulI32(dst=sgpr(sTmp), src0=sgpr(sTmp), src1=sgpr("ItersPerTile"), comment="Total SK iters")) module.add(SCmpLtU32(src0=sgpr(sTmp), src1=sgpr("TotalIters"), comment="Check if there are DP tiles to do")) module.add(SCBranchSCC1(labelName=skInitDone.getLabelName(), comment="Done init")) writer.sgprPool.checkIn(sTmp) @@ -1784,7 +1797,8 @@ def preLoop(self, writer, kernel): # clamp to end of sk iterations # TODO maybe remove clamp, since extra iters code should guarantee total iterations match sTmp = writer.sgprPool.checkOut(1, "TotalSKIters", preventOverflow=False) - module.add(SMulI32(dst=sgpr(sTmp), src0=sgpr("skTiles"), src1=sgpr("ItersPerTile"), comment="Total SK iters")) + module.add(SAndB32(dst=sgpr(sTmp), src0=sgpr("skGridAndTiles"), src1=hex(65535), comment="Get skTiles")) + module.add(SMulI32(dst=sgpr(sTmp), src0=sgpr(sTmp), src1=sgpr("ItersPerTile"), comment="Total SK iters")) module.add(SMinU32(dst=sgpr("StreamKIterEnd"), src0=sgpr("StreamKIterEnd"), src1=sgpr(sTmp), comment="Cap ending iter at total SK iters")) writer.sgprPool.checkIn(sTmp) @@ -1805,10 +1819,12 @@ def graWorkGroup(self, writer, kernel, tPA, tPB): skUpdateDone = Label("SK_UpdateDone", "") # sTmp+3 = Offset to first SK tile - module.add(SMulI32(dst=sgpr(sTmp+3), src0=sgpr("skTiles"), src1=sgpr("ItersPerTile"), comment="Total SK iters")) + module.add(SAndB32(dst=sgpr(sTmp+3), src0=sgpr("skGridAndTiles"), src1=hex(65535), comment="Get skTiles")) + module.add(SMulI32(dst=sgpr(sTmp+3), src0=sgpr(sTmp+3), src1=sgpr("ItersPerTile"), comment="Total SK iters")) module.add(SSubU32(dst=sgpr(sTmp+3), src0=sgpr("TotalIters"), src1=sgpr(sTmp+3), comment="Offset to first SK tile")) # If in DP, add dpShift - module.add(SMulI32(dst=sgpr(sTmp+1), src0=sgpr("skGrid"), src1=sgpr("ItersPerTile"), comment="DP iterations shift")) + module.add(SLShiftRightB32(dst=sgpr(sTmp+1), shiftHex=hex(16), src=sgpr("skGridAndTiles"), comment="Get skGrid")) + module.add(SMulI32(dst=sgpr(sTmp+1), src0=sgpr(sTmp+1), src1=sgpr("ItersPerTile"), comment="DP iterations shift")) module.add(SAddU32(dst=sgpr(sTmp+1), src0=sgpr(sTmp+1), src1=sgpr("StreamKIter"), comment="Add DP shift")) # if sTmp+1 < sTmp+3, continue DP (add dpShift) module.add(SCmpLtU32(src0=sgpr(sTmp+1), src1=sgpr(sTmp+3), comment="Check if still in DP section")) diff --git a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s index 2b1223f00627..cb37d619c170 100644 --- a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s +++ b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s @@ -30,7 +30,7 @@ .text /* Num VGPR =249 */ /* Num AccVGPR=256 */ -/* Num SGPR =111 */ +/* Num SGPR =105 */ /******************************************/ /* Optimizations and Config: */ @@ -229,106 +229,71 @@ amdhsa.kernels: .offset: 116 .value_kind: by_value .value_type: f32 - - .name: MagicNumberProblemNumGroupTiles0 + - .name: ItersPerTile .size: 4 .offset: 120 .value_kind: by_value .value_type: u32 - - .name: MagicShiftProblemNumGroupTiles0 + - .name: TotalIters .size: 4 .offset: 124 .value_kind: by_value .value_type: u32 - - .name: ItersPerTile + - .name: SKItersPerWG .size: 4 .offset: 128 .value_kind: by_value .value_type: u32 - - .name: MagicNumberItersPerTile + - .name: skGridAndTiles .size: 4 .offset: 132 .value_kind: by_value .value_type: u32 - - .name: MagicShiftItersPerTile - .size: 4 - .offset: 136 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumProblemNumGroupTiles0By1 - .size: 4 - .offset: 140 - .value_kind: by_value - .value_type: u32 - - .name: MagicShiftProblemNumGroupTiles0By1 - .size: 4 - .offset: 144 - .value_kind: by_value - .value_type: u32 - - .name: TotalIters - .size: 4 - .offset: 148 - .value_kind: by_value - .value_type: u32 - - .name: SKItersPerWG - .size: 4 - .offset: 152 - .value_kind: by_value - .value_type: u32 - - .name: skGrid - .size: 4 - .offset: 156 - .value_kind: by_value - .value_type: u32 - - .name: skTiles - .size: 4 - .offset: 160 - .value_kind: by_value - .value_type: u32 - .name: skExtraIters .size: 4 - .offset: 164 + .offset: 136 .value_kind: by_value .value_type: u32 - .name: AddressScaleAlphaVec .size: 8 - .offset: 168 + .offset: 140 .value_kind: global_buffer .value_type: f32 .address_space: generic - .name: bias .size: 8 - .offset: 176 + .offset: 148 .value_kind: global_buffer .value_type: void .address_space: generic - .name: biasType .size: 4 - .offset: 184 + .offset: 156 .value_kind: by_value .value_type: u32 - .name: StrideBias .size: 4 - .offset: 188 + .offset: 160 .value_kind: by_value .value_type: u32 - .name: activationAlpha .size: 4 - .offset: 192 + .offset: 164 .value_kind: by_value .value_type: f32 - .name: activationBeta .size: 4 - .offset: 196 + .offset: 168 .value_kind: by_value .value_type: f32 - .name: activationType .size: 4 - .offset: 200 + .offset: 172 .value_kind: by_value .value_type: u32 .group_segment_fixed_size: 133120 .kernarg_segment_align: 8 - .kernarg_segment_size: 208 + .kernarg_segment_size: 176 .max_flat_workgroup_size: 256 .private_segment_fixed_size: 0 .sgpr_count: 102 @@ -339,7 +304,6 @@ amdhsa.kernels: ... .end_amdgpu_metadata Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950: -label_ASM_Start: /// Main body of the asm kernel .macro V_MAGIC_DIV vgprDstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req v_mul_hi_u32 v[\vgprDstIdx+1], \dividend, \magicNumber v_mul_lo_u32 v[\vgprDstIdx+0], \dividend, \magicA @@ -402,28 +366,21 @@ label_ASM_Start: /// Main body of the asm kernel .set sgprStridesB, 42 .set sgprAlpha, 44 .set sgprBeta, 45 -.set sgprMagicNumberProblemNumGroupTiles0, 46 -.set sgprMagicShiftProblemNumGroupTiles0, 47 -.set sgprItersPerTile, 48 -.set sgprMagicNumberItersPerTile, 49 -.set sgprMagicShiftItersPerTile, 50 -.set sgprMagicNumProblemNumGroupTiles0By1, 51 -.set sgprMagicShiftProblemNumGroupTiles0By1, 52 -.set sgprTotalIters, 53 -.set sgprSKItersPerWG, 54 -.set sgprskGrid, 55 -.set sgprskTiles, 56 -.set sgprskExtraIters, 57 -.set sgprLocalWriteAddrA, 58 -.set sgprLocalWriteAddrB, 59 -.set sgprSwapA, 60 -.set sgprSwapB, 61 -.set sgprStreamKIdx, 62 -.set sgprStreamKIter, 63 -.set sgprStreamKIterEnd, 64 -.set sgprStreamKLocalStart, 65 -.set sgprStreamKLocalEnd, 66 -.set sgprSrdWS, 68 +.set sgprItersPerTile, 46 +.set sgprTotalIters, 47 +.set sgprSKItersPerWG, 48 +.set sgprskGridAndTiles, 49 +.set sgprskExtraIters, 50 +.set sgprLocalWriteAddrA, 51 +.set sgprLocalWriteAddrB, 52 +.set sgprSwapA, 53 +.set sgprSwapB, 54 +.set sgprStreamKIdx, 55 +.set sgprStreamKIter, 56 +.set sgprStreamKIterEnd, 57 +.set sgprStreamKLocalStart, 58 +.set sgprStreamKLocalEnd, 59 +.set sgprSrdWS, 60 /* Size Assignments */ .set sgprSizeI, sgprSizesFree+0 @@ -504,29 +461,30 @@ label_ASM_Start: /// Main body of the asm kernel /******************************************/ /* Load num of Gemms */ -s_load_dword s67, s[sgprKernArgAddress:sgprKernArgAddress+1], 0 +s_load_dword s64, s[sgprKernArgAddress:sgprKernArgAddress+1], 0 /* Load packed kernel args (StaggerU/GSU) */ -s_load_dword s73, s[sgprKernArgAddress:sgprKernArgAddress+1], 4 +s_load_dword s66, s[sgprKernArgAddress:sgprKernArgAddress+1], 4 /* Load WGM data */ s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 8 /* Load num of WGs */ -s_load_dword s74, s[sgprKernArgAddress:sgprKernArgAddress+1], 12 +s_load_dword s67, s[sgprKernArgAddress:sgprKernArgAddress+1], 12 s_waitcnt lgkmcnt(0) // load args -s_lshr_b32 s72, s67, 0x1e // Get arg type -s_and_b32 s67, 0x3fffffff, s67 // Get nums of gemm -s_cmp_eq_u32 s72, 0 // Is kernel args +s_lshr_b32 s65, s64, 0x1e // Get arg type +s_and_b32 s64, 0x3fffffff, s64 // Get nums of gemm +s_cmp_eq_u32 s65, 0 // Is kernel args s_cbranch_scc0 label_HBMArgs s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Load Kernel Args */ s_load_dwordx16 s[20:35], s[sgprKernArgAddress:sgprKernArgAddress+1], 0 // 0 -s_load_dwordx16 s[36:51], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 -s_load_dwordx4 s[52:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 128 // 128 -s_load_dwordx2 s[56:57], s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx8 s[36:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 +s_load_dwordx4 s[44:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 +s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120 s_waitcnt lgkmcnt(0) // preload s_branch label_LoadArgsEnd label_HBMArgs: @@ -537,9 +495,7 @@ s_waitcnt lgkmcnt(0) // wait for args to load label_LoadArgsEnd: s_branch label_common_kernel_entry -/* pad 35 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */ -s_nop 0 -s_nop 0 +/* pad 33 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */ s_nop 0 s_nop 0 s_nop 0 @@ -574,10 +530,10 @@ s_nop 0 s_nop 0 s_nop 0 label_Preload_Offset_Start: -s_and_b32 s67, 0x3fffffff, s2 // Get nums of gemm -s_lshr_b32 s72, s2, 0x1e // Get arg type -s_mov_b32 s73, s3 // Preload internal args -s_cmp_eq_u32 s72, 0 // Is kernel args +s_and_b32 s64, 0x3fffffff, s2 // Get nums of gemm +s_lshr_b32 s65, s2, 0x1e // Get arg type +s_mov_b32 s66, s3 // Preload internal args +s_cmp_eq_u32 s65, 0 // Is kernel args s_cbranch_scc0 label_Preload_HBMArgs s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 @@ -585,9 +541,9 @@ s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Load Kernel Args */ s_load_dword s27, s[sgprKernArgAddress:sgprKernArgAddress+1], 28 // 28 s_load_dwordx16 s[28:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 32 // 32 -s_load_dwordx8 s[44:51], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96 -s_load_dwordx4 s[52:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 128 // 128 -s_load_dwordx2 s[56:57], s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx4 s[44:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 +s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120 s_mov_b64 s[20:21], s[6:7] // move preload data to correct sgpr s_mov_b64 s[22:23], s[8:9] // move preload data to correct sgpr s_mov_b64 s[24:25], s[10:11] // move preload data to correct sgpr @@ -597,90 +553,90 @@ label_Preload_HBMArgs: s_mov_b64 s[sgprKernArgAddress:sgprKernArgAddress+1], s[6:7] // Load address of kernel arguments label_Preload_LoadArgsEnd: s_mov_b32 s[sgprWGM], s4 // Preload internal args2 -s_mov_b32 s74, s5 // Load num of WGs +s_mov_b32 s67, s5 // Load num of WGs label_common_kernel_entry: /// for both preload/non-preload common code s_mov_b32 s[sgprWorkGroup0+0], s13 // restore workgroup id s_mov_b32 s[sgprWorkGroup0+1], s14 // restore workgroup id s_mov_b32 s[sgprWorkGroup0+2], s15 // restore workgroup id -s_and_b32 s[sgprStaggerU], s73, 0xffff0000 // Restore StaggerU related vars +s_and_b32 s[sgprStaggerU], s66, 0xffff0000 // Restore StaggerU related vars s_lshr_b32 s[sgprStaggerU], s[sgprStaggerU], 0x10 -s_mov_b32 s[sgprArgType], s72 +s_mov_b32 s[sgprArgType], s65 s_mov_b32 m0, 0x20800 // LDS clamp at 133120 bytes v_mov_b32 v[vgprSerial], v0 // thread serial id /* remap workgroup to XCCs */ -s_lshr_b32 s80, s[sgprWGM], 0x10 // Get WGMXCC -s_ff1_i32_b32 s80, s80 // Get log(WGMXCC) -s_lshr_b32 s81, s[sgprWGM], 0x16 // Get CU_Count +s_lshr_b32 s72, s[sgprWGM], 0x10 // Get WGMXCC +s_ff1_i32_b32 s72, s72 // Get log(WGMXCC) +s_lshr_b32 s73, s[sgprWGM], 0x16 // Get CU_Count /* remap WGs if WGMXCC > 1 ( log(WGMXCC) > 0 ) */ -s_cmp_gt_i32 s80, 0 +s_cmp_gt_i32 s72, 0 s_cbranch_scc0 label_skip_WGMXCC /* only remap WGs in the range */ -s_lshr_b32 s77, s74, s80 -s_lshl_b32 s77, s77, s80 -s_cmp_ge_u32 s[sgprWorkGroup0], s77 +s_lshr_b32 s69, s67, s72 +s_lshl_b32 s69, s69, s72 +s_cmp_ge_u32 s[sgprWorkGroup0], s69 s_cbranch_scc1 label_skip_WGMXCC -s_cmp_eq_u32 s81, 0 // CU_Count == 0 ? +s_cmp_eq_u32 s73, 0 // CU_Count == 0 ? s_cbranch_scc0 label_XCCG_nonzero -s_lshr_b32 s77, s[sgprWorkGroup0], s80 -s_bfm_b32 s78, s80, 0 -s_and_b32 s78, s[sgprWorkGroup0], s78 -s_lshr_b32 s79, s74, s80 -s_mul_i32 s78, s78, s79 -s_add_u32 s[sgprWorkGroup0], s77, s78 +s_lshr_b32 s69, s[sgprWorkGroup0], s72 +s_bfm_b32 s70, s72, 0 +s_and_b32 s70, s[sgprWorkGroup0], s70 +s_lshr_b32 s71, s67, s72 +s_mul_i32 s70, s70, s71 +s_add_u32 s[sgprWorkGroup0], s69, s70 s_branch label_skip_WGMXCC label_XCCG_nonzero: /* temp0 = (wg//CU_Count)*CU_Count */ -v_cvt_f32_u32 v4, s81 // wg//CU_Count +v_cvt_f32_u32 v4, s73 // wg//CU_Count v_rcp_iflag_f32 v4, v4 // wg//CU_Count v_cvt_f32_u32 v5, s[sgprWorkGroup0] // wg//CU_Count v_mul_f32 v4, v4, v5 // wg//CU_Count v_cvt_u32_f32 v4, v4 // wg//CU_Count -v_mul_u32_u24 v5, v4, s81 // wg//CU_Count +v_mul_u32_u24 v5, v4, s73 // wg//CU_Count v_sub_u32 v5, s[sgprWorkGroup0], v5 // wg//CU_Count -v_cmpx_eq_u32 exec, v5, s81 // wg//CU_Count +v_cmpx_eq_u32 exec, v5, s73 // wg//CU_Count v_add_u32 v4, 1, v4 // wg//CU_Count v_mov_b32 v5, 0 // wg//CU_Count s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s81 // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s73 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 -v_mul_u32_u24 v5, v4, s81 // re-calculate remainder +v_mul_u32_u24 v5, v4, s73 // re-calculate remainder v_sub_u32 v5, s[sgprWorkGroup0], v5 // re-calculate remainder s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s77, v4 // quotient -v_readfirstlane_b32 s78, v5 // remainder -s_mul_i32 s77, s77, s81 +v_readfirstlane_b32 s69, v4 // quotient +v_readfirstlane_b32 s70, v5 // remainder +s_mul_i32 s69, s69, s73 /* temp1 = (wg%CU_Count)//WGMXCC */ -s_lshr_b32 s78, s78, s80 +s_lshr_b32 s70, s70, s72 /* temp0 = temp0 + temp1 */ -s_add_u32 s77, s77, s78 +s_add_u32 s69, s69, s70 /* temp1 = (wg%WGMXCC) * ((WGs - (WGs//CU_Count) * CU_Count) if (wg > (WGs//CU_Count) * CU_Count) else CU_Count)//WGMXCC */ -v_cvt_f32_u32 v4, s81 // WGs//CU_Count +v_cvt_f32_u32 v4, s73 // WGs//CU_Count v_rcp_iflag_f32 v4, v4 // WGs//CU_Count -v_cvt_f32_u32 v5, s74 // WGs//CU_Count +v_cvt_f32_u32 v5, s67 // WGs//CU_Count v_mul_f32 v4, v4, v5 // WGs//CU_Count v_cvt_u32_f32 v4, v4 // WGs//CU_Count -v_mul_u32_u24 v5, v4, s81 // WGs//CU_Count -v_sub_u32 v5, s74, v5 // WGs//CU_Count -v_cmpx_eq_u32 exec, v5, s81 // WGs//CU_Count +v_mul_u32_u24 v5, v4, s73 // WGs//CU_Count +v_sub_u32 v5, s67, v5 // WGs//CU_Count +v_cmpx_eq_u32 exec, v5, s73 // WGs//CU_Count v_add_u32 v4, 1, v4 // WGs//CU_Count s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s81 // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s73 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s78, v4 // quotient -s_mul_i32 s78, s78, s81 -s_sub_u32 s79, s74, s78 -s_cmp_gt_u32 s[sgprWorkGroup0], s78 -s_cselect_b32 s78, s79, s81 -s_lshr_b32 s78, s78, s80 -s_bfm_b32 s79, s80, 0 -s_and_b32 s79, s[sgprWorkGroup0], s79 -s_mul_i32 s78, s78, s79 +v_readfirstlane_b32 s70, v4 // quotient +s_mul_i32 s70, s70, s73 +s_sub_u32 s71, s67, s70 +s_cmp_gt_u32 s[sgprWorkGroup0], s70 +s_cselect_b32 s70, s71, s73 +s_lshr_b32 s70, s70, s72 +s_bfm_b32 s71, s72, 0 +s_and_b32 s71, s[sgprWorkGroup0], s71 +s_mul_i32 s70, s70, s71 /* WorkGroup0 = temp0 + temp1 */ -s_add_u32 s[sgprWorkGroup0], s77, s78 +s_add_u32 s[sgprWorkGroup0], s69, s70 label_skip_WGMXCC: /// skip WGMXCC if no enough WGs to remap -s_cmp_eq_u32 s72, 0 +s_cmp_eq_u32 s65, 0 s_cbranch_scc0 label_MultiGemm /* init: add vgpr [4...136) to pool */ /* init: add vgpr [0...0) to pool */ @@ -710,97 +666,98 @@ v_cmp_ne_u32 vcc, v7, 0 // v4 = ceil(v5 / v6) v_addc_co_u32 v4, vcc, v4, 0, vcc // ceil s_nop 0 // 1 wait states v_readfirstlane_b32 s[sgprNumWorkGroups1], v4 // set back to numWorkGroup1 -s_waitcnt lgkmcnt(0) // wait for 108/0 bytes of kern args +s_waitcnt lgkmcnt(0) // wait for 80/0 bytes of kern args s_branch label_MultiGemmEnd label_MultiGemm: /* Check if custom structure pointer is null */ s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ? s_cbranch_scc1 label_IsExternalValid // branch if ArgType == 2 -s_mov_b32 s11, 188 -s_mul_i32 s78, s67, 4 -s_mov_b64 s[72:73], s[sgprKernArgAddress:sgprKernArgAddress+1] +s_mov_b32 s11, 160 +s_mul_i32 s72, s64, 4 +s_mov_b64 s[66:67], s[sgprKernArgAddress:sgprKernArgAddress+1] s_branch label_IsExternalValidEnd label_IsExternalValid: -s_mov_b32 s11, 244 -s_mov_b32 s78, 0 -s_mov_b64 s[72:73], s[sgprKernArgAddress:sgprKernArgAddress+1] +s_mov_b32 s11, 216 +s_mov_b32 s72, 0 +s_mov_b64 s[66:67], s[sgprKernArgAddress:sgprKernArgAddress+1] label_IsExternalValidEnd: /* Grouped Gemm:: prefetch 1 arg load */ s_mov_b32 s10, 1 -s_mov_b32 s79, 0 -s_load_dwordx4 s[20:23], s[72:73], s78 -s_cmpk_eq_u32 s67, 1 // if gemm_count is 1? +s_mov_b32 s73, 0 +s_load_dwordx4 s[20:23], s[66:67], s72 +s_cmpk_eq_u32 s64, 1 // if gemm_count is 1? s_cbranch_scc1 label_wgTable_noLoadLoop /* Grouped Gemm:: accumulate numTiles for each gemm */ /* Grouped Gemm:: loop start */ label_Loop_GemmCount: s_waitcnt lgkmcnt(0) -s_lshr_b32 s76, s20, 8 // s76 = s20 / 256 -s_and_b32 s74, 255, s20 // s74 = s20 % 256 -s_addc_u32 s76, s76, 0 -s_lshr_b32 s77, s21, 8 // s77 = s21 / 256 -s_and_b32 s74, 255, s21 // s74 = s21 % 256 -s_addc_u32 s77, s77, 0 -s_mul_i32 s76, s76, s77 -s_mul_i32 s76, s76, s22 -s_add_u32 s79, s79, s76 -s_cmp_lt_u32 s[sgprWorkGroup0], s79 +s_lshr_b32 s70, s20, 8 // s70 = s20 / 256 +s_and_b32 s68, 255, s20 // s68 = s20 % 256 +s_addc_u32 s70, s70, 0 +s_lshr_b32 s71, s21, 8 // s71 = s21 / 256 +s_and_b32 s68, 255, s21 // s68 = s21 % 256 +s_addc_u32 s71, s71, 0 +s_mul_i32 s70, s70, s71 +s_mul_i32 s70, s70, s22 +s_add_u32 s73, s73, s70 +s_cmp_lt_u32 s[sgprWorkGroup0], s73 s_cbranch_scc1 label_FOUND -s_add_u32 s78, s78, s11 -s_load_dwordx4 s[20:23], s[72:73], s78 +s_add_u32 s72, s72, s11 +s_load_dwordx4 s[20:23], s[66:67], s72 s_add_u32 s10, s10, 1 -s_cmp_lt_u32 s10, s67 +s_cmp_lt_u32 s10, s64 s_cbranch_scc1 label_Loop_GemmCount /* Grouped Gemm:: noLoadLoop */ label_wgTable_noLoadLoop: s_waitcnt lgkmcnt(0) -s_lshr_b32 s76, s20, 8 // s76 = s20 / 256 -s_and_b32 s74, 255, s20 // s74 = s20 % 256 -s_addc_u32 s76, s76, 0 -s_lshr_b32 s77, s21, 8 // s77 = s21 / 256 -s_and_b32 s74, 255, s21 // s74 = s21 % 256 -s_addc_u32 s77, s77, 0 -s_mul_i32 s76, s76, s77 -s_mul_i32 s76, s76, s22 -s_add_u32 s79, s79, s76 +s_lshr_b32 s70, s20, 8 // s70 = s20 / 256 +s_and_b32 s68, 255, s20 // s68 = s20 % 256 +s_addc_u32 s70, s70, 0 +s_lshr_b32 s71, s21, 8 // s71 = s21 / 256 +s_and_b32 s68, 255, s21 // s68 = s21 % 256 +s_addc_u32 s71, s71, 0 +s_mul_i32 s70, s70, s71 +s_mul_i32 s70, s70, s22 +s_add_u32 s73, s73, s70 /* Grouped Gemm:: gemmIndex found */ label_FOUND: -s_sub_u32 s73, s10, 1 -s_sub_u32 s72, s79, s76 -s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s72 +s_sub_u32 s67, s10, 1 +s_sub_u32 s66, s73, s70 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s66 /* Check if custom structure pointer is null */ s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ? s_cbranch_scc1 label_LoadExternalStruct // branch if ArgType == 2 /* Grouped Gemm: offset argument address to gemm */ /* Grouped Gemm: offset address from wg_table_start to args_start */ -s_lshl2_add_u32 s[sgprKernArgAddress], s67, s[sgprKernArgAddress] +s_lshl2_add_u32 s[sgprKernArgAddress], s64, s[sgprKernArgAddress] s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Grouped Gemm: offset address from args_start to gemm_start */ -s_mul_i32 s73, s73, 188 -s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s73 +s_mul_i32 s67, s67, 160 +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s67 s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Load Kernel Args */ s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16 -s_load_dwordx16 s[40:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 -s_load_dwordx2 s[56:57], s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx8 s[40:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 +s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120 s_branch label_LoadExternalStructEnd label_LoadExternalStruct: /* Grouped Gemm: offset address from args_start to gemm_start */ -s_mul_i32 s73, s73, 244 -s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s73 +s_mul_i32 s67, s67, 216 +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s67 s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16 -s_load_dwordx16 s[40:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 -s_load_dword s56, s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx8 s[40:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 // Read Beta -s_load_dword s45, s[sgprKernArgAddress:sgprKernArgAddress+1], 160 // 160 +s_load_dword s45, s[sgprKernArgAddress:sgprKernArgAddress+1], 132 // 132 label_LoadExternalStructEnd: /* init: add vgpr [4...136) to pool */ /* init: add vgpr [0...0) to pool */ @@ -830,7 +787,7 @@ v_cmp_ne_u32 vcc, v7, 0 // v4 = ceil(v5 / v6) v_addc_co_u32 v4, vcc, v4, 0, vcc // ceil s_nop 0 // 1 wait states v_readfirstlane_b32 s[sgprNumWorkGroups1], v4 // set back to numWorkGroup1 -s_waitcnt lgkmcnt(0) // wait for 108/0 bytes of kern args +s_waitcnt lgkmcnt(0) // wait for 80/0 bytes of kern args /* Early stop if N(SizeFreeJ) == 0 */ s_cmp_eq_u32 s[sgprSizeJ], 0 @@ -840,26 +797,17 @@ s_endpgm label_NoEarlyStop_N0: label_MultiGemmEnd: -.set sgprSrdA, 72 -.set sgprSrdB, 76 -.set sgprShadowLimitA, 80 -.set sgprShadowLimitB, 82 -.set sgprStaggerUIter, 67 -.set sgprWrapUA, sgprKernArgAddress -.set sgprWrapUB, 84 -.set sgprGlobalReadIncsA, 86 -.set sgprGlobalReadIncsB, 87 -.set sgprScalarGlobalReadOffsetA, 88 -.set sgprScalarGlobalReadOffsetB, 95 - -.set sgpr104, 88 -.set sgpr105, 89 -.set sgpr106, 90 -.set sgpr107, 91 -.set sgpr108, 92 -.set sgpr109, 93 -.set sgpr110, 94 - +.set sgprSrdA, 64 +.set sgprSrdB, 68 +.set sgprShadowLimitA, 72 +.set sgprShadowLimitB, 74 +.set sgprStaggerUIter, 76 +.set sgprWrapUA, 77 +.set sgprWrapUB, 79 +.set sgprGlobalReadIncsA, 81 +.set sgprGlobalReadIncsB, 82 +.set sgprScalarGlobalReadOffsetA, 83 +.set sgprScalarGlobalReadOffsetB, 90 s_sub_u32 s[sgprAddressA+0], s[sgprAddressA+0], 16 // pre-pad to make room for possible pointer shift s_subb_u32 s[sgprAddressA+1], s[sgprAddressA+1], 0 // pre-pad to make room for possible pointer shift s_sub_u32 s[sgprAddressB+0], s[sgprAddressB+0], 16 // pre-pad to make room for possible pointer shift @@ -873,28 +821,30 @@ label_AlphaNonZero: s_mov_b32 s[sgprStreamKIdx], s[sgprWorkGroup0] // Save original StreamK index s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprItersPerTile] // DP starting iteration (case: DP work to do) s_mov_b32 s[sgprStreamKIterEnd], s[sgprTotalIters] // DP ending iteration (case: only DP work to do) -s_mul_i32 s[sgpr104], s[sgprskTiles], s[sgprItersPerTile] // Total SK iters -s_cmp_lt_u32 s[sgpr104], s[sgprTotalIters] // Check if there are DP tiles to do +s_and_b32 s97, s[sgprskGridAndTiles], 0xffff // Get skTiles +s_mul_i32 s97, s97, s[sgprItersPerTile] // Total SK iters +s_cmp_lt_u32 s97, s[sgprTotalIters] // Check if there are DP tiles to do s_cbranch_scc1 label_SK_InitDone // Done init s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprSKItersPerWG] // StreamK starting iteration (case: after extra iters) s_add_u32 s[sgprStreamKIter], s[sgprStreamKIter], s[sgprskExtraIters] // Add extra iters s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIter], s[sgprSKItersPerWG] // StreamK ending iteration (case: after extra iters) -s_add_u32 s[sgpr105], s[sgprSKItersPerWG], 1 // Spread out extra iterations -s_mul_i32 s[sgpr104], s[sgprStreamKIdx], s[sgpr105] // StreamK starting iteration (case: before extra iters) -s_add_u32 s[sgpr105], s[sgpr104], s[sgpr105] // StreamK ending iteration (case: before extra iters) +s_add_u32 s98, s[sgprSKItersPerWG], 1 // Spread out extra iterations +s_mul_i32 s97, s[sgprStreamKIdx], s98 // StreamK starting iteration (case: before extra iters) +s_add_u32 s98, s97, s98 // StreamK ending iteration (case: before extra iters) s_cmp_lt_u32 s[sgprStreamKIdx], s[sgprskExtraIters] // Check if lane gets an extra iteration -s_cselect_b32 s[sgprStreamKIter], s[sgpr104], s[sgprStreamKIter] // Set start iter -s_cselect_b32 s[sgprStreamKIterEnd], s[sgpr105], s[sgprStreamKIterEnd] // Set end iter -s_mul_i32 s[sgpr104], s[sgprskTiles], s[sgprItersPerTile] // Total SK iters -s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgpr104] // Cap ending iter at total SK iters +s_cselect_b32 s[sgprStreamKIter], s97, s[sgprStreamKIter] // Set start iter +s_cselect_b32 s[sgprStreamKIterEnd], s98, s[sgprStreamKIterEnd] // Set end iter +s_and_b32 s97, s[sgprskGridAndTiles], 0xffff // Get skTiles +s_mul_i32 s97, s97, s[sgprItersPerTile] // Total SK iters +s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s97 // Cap ending iter at total SK iters label_SK_InitDone: s_cmp_lt_u32 s[sgprStreamKIter], s[sgprTotalIters] // Make sure there's work to do s_cbranch_scc1 label_NoBranch_T8JHFHKM7BO5OHXW // Only branch on scc0 -s_getpc_b64 s[sgpr104:sgpr105] // addr of next instr -s_add_i32 s[sgpr106], label_KernelEnd, 4 // target branch offset -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr106] // add target branch offset -s_addc_u32 s[sgpr105], s[sgpr105], 0 // add high and carry -s_setpc_b64 s[sgpr104:sgpr105] // branch to label_KernelEnd +s_getpc_b64 s[98:99] // addr of next instr +s_add_i32 s100, label_KernelEnd, 4 // target branch offset +s_add_u32 s98, s98, s100 // add target branch offset +s_addc_u32 s99, s99, 0 // add high and carry +s_setpc_b64 s[98:99] // branch to label_KernelEnd label_NoBranch_T8JHFHKM7BO5OHXW: /******************************************/ @@ -902,19 +852,15 @@ label_NoBranch_T8JHFHKM7BO5OHXW: /******************************************/ label_PersistentLoopStart: +// Use sgprScalarGlobalReadOffsetA/B sgprs +.set sgpr102, 84 +.set sgpr103, 85 +.set sgpr104, 86 + /******************************************/ /* Begin setupNewTile */ /******************************************/ -// Use sgprScalarGlobalReadOffsetA/B sgprs -.set sgpr104, 88 -.set sgpr105, 89 -.set sgpr106, 90 -.set sgpr107, 91 -.set sgpr108, 92 -.set sgpr109, 93 -.set sgpr110, 94 - /* global read addresses: work-group */ /* graWorkGroup mapping */ @@ -928,78 +874,106 @@ v_min_i32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], v4 // Set LRA to first b v_xor_b32 v4, v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // Get other lds buffer offset value v_min_i32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], v4 // Set LRA to first buffer offset /* StreamK calculate tile idx and map to WG */ -s_mul_hi_u32 s[sgpr105], s[sgprStreamKIter], s[sgprMagicNumberItersPerTile] // s_magic mul, div alg 2 -s_lshr_b32 s[sgpr106], s[sgprMagicShiftItersPerTile], 31 // tmpS = extract abit -s_mul_i32 s[sgpr104], s[sgprStreamKIter], s[sgpr106] // s_magic mul, div alg 2 -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr105] -s_and_b32 s[sgpr106], s[sgprMagicShiftItersPerTile], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s[sgpr104], s[sgpr104], s[sgpr106] // sMagicDiv Alg 2 -s_mul_i32 s[sgpr105], s[sgpr104], s[sgprItersPerTile] // Tile start iteration -s_add_u32 s[sgpr106], s[sgpr105], s[sgprItersPerTile] // Tile end iteration -s_sub_u32 s[sgprStreamKLocalStart], s[sgprStreamKIter], s[sgpr105] // Local iteration start -s_min_u32 s[sgprStreamKLocalEnd], s[sgprStreamKIterEnd], s[sgpr106] // 1. (Local) iteration end (SK tile) -s_sub_u32 s[sgprStreamKLocalEnd], s[sgprStreamKLocalEnd], s[sgpr105] // 2. Local iteration end (SK tile) -s_mul_i32 s[sgpr107], s[sgprskTiles], s[sgprItersPerTile] // Total SK iters -s_sub_u32 s[sgpr107], s[sgprTotalIters], s[sgpr107] // Offset to first SK tile -s_mul_i32 s[sgpr105], s[sgprskGrid], s[sgprItersPerTile] // DP iterations shift -s_add_u32 s[sgpr105], s[sgpr105], s[sgprStreamKIter] // Add DP shift -s_cmp_lt_u32 s[sgpr105], s[sgpr107] // Check if still in DP section +v_cvt_f32_u32 v4, s[sgprItersPerTile] // StreamKIter // ItersPerTile +v_rcp_iflag_f32 v4, v4 // StreamKIter // ItersPerTile +v_cvt_f32_u32 v5, s[sgprStreamKIter] // StreamKIter // ItersPerTile +v_mul_f32 v4, v4, v5 // StreamKIter // ItersPerTile +v_cvt_u32_f32 v4, v4 // StreamKIter // ItersPerTile +v_mul_u32_u24 v5, v4, s[sgprItersPerTile] // StreamKIter // ItersPerTile +v_sub_u32 v5, s[sgprStreamKIter], v5 // StreamKIter // ItersPerTile +v_cmpx_eq_u32 exec, v5, s[sgprItersPerTile] // StreamKIter // ItersPerTile +v_add_u32 v4, 1, v4 // StreamKIter // ItersPerTile +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprItersPerTile] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s98, v4 // quotient +s_mul_i32 s99, s98, s[sgprItersPerTile] // Tile start iteration +s_add_u32 s100, s99, s[sgprItersPerTile] // Tile end iteration +s_sub_u32 s[sgprStreamKLocalStart], s[sgprStreamKIter], s99 // Local iteration start +s_min_u32 s[sgprStreamKLocalEnd], s[sgprStreamKIterEnd], s100 // 1. (Local) iteration end (SK tile) +s_sub_u32 s[sgprStreamKLocalEnd], s[sgprStreamKLocalEnd], s99 // 2. Local iteration end (SK tile) +s_and_b32 s101, s[sgprskGridAndTiles], 0xffff // Get skTiles +s_mul_i32 s101, s101, s[sgprItersPerTile] // Total SK iters +s_sub_u32 s101, s[sgprTotalIters], s101 // Offset to first SK tile +s_lshr_b32 s99, s[sgprskGridAndTiles], 0x10 // Get skGrid +s_mul_i32 s99, s99, s[sgprItersPerTile] // DP iterations shift +s_add_u32 s99, s99, s[sgprStreamKIter] // Add DP shift +s_cmp_lt_u32 s99, s101 // Check if still in DP section s_cbranch_scc1 label_SK_UpdateDone // Done update -s_mov_b32 s[sgpr105], s[sgpr106] // SK iterations shift -s_cmp_le_u32 s[sgpr107], s[sgprStreamKIter] // Check if continuing in SK section +s_mov_b32 s99, s100 // SK iterations shift +s_cmp_le_u32 s101, s[sgprStreamKIter] // Check if continuing in SK section s_cbranch_scc1 label_SK_UpdateDone // Done update s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprSKItersPerWG] // StreamK starting iteration (case: after extra iters) s_add_u32 s[sgprStreamKIter], s[sgprStreamKIter], s[sgprskExtraIters] // Add extra iters s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIter], s[sgprSKItersPerWG] // StreamK ending iteration (case: after extra iters) -s_add_u32 s[sgpr109], s[sgprSKItersPerWG], 1 // Spread out extra iterations -s_mul_i32 s[sgpr108], s[sgprStreamKIdx], s[sgpr109] // StreamK starting iteration (case: before extra iters) -s_add_u32 s[sgpr109], s[sgpr108], s[sgpr109] // StreamK ending iteration (case: before extra iters) +s_add_u32 s[sgpr103], s[sgprSKItersPerWG], 1 // Spread out extra iterations +s_mul_i32 s[sgpr102], s[sgprStreamKIdx], s[sgpr103] // StreamK starting iteration (case: before extra iters) +s_add_u32 s[sgpr103], s[sgpr102], s[sgpr103] // StreamK ending iteration (case: before extra iters) s_cmp_lt_u32 s[sgprStreamKIdx], s[sgprskExtraIters] // Check if lane gets an extra iteration -s_cselect_b32 s[sgprStreamKIter], s[sgpr108], s[sgprStreamKIter] // Set start iter -s_cselect_b32 s[sgprStreamKIterEnd], s[sgpr109], s[sgprStreamKIterEnd] // Set end iter -s_add_u32 s[sgpr105], s[sgprStreamKIter], s[sgpr107] // Offset to start of SK section -s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgpr107] // Offset to start of SK section +s_cselect_b32 s[sgprStreamKIter], s[sgpr102], s[sgprStreamKIter] // Set start iter +s_cselect_b32 s[sgprStreamKIterEnd], s[sgpr103], s[sgprStreamKIterEnd] // Set end iter +s_add_u32 s99, s[sgprStreamKIter], s101 // Offset to start of SK section +s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s101 // Offset to start of SK section s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgprTotalIters] // Cap ending iter at total SK iters s_cmp_lt_u32 s[sgprStreamKIter], s[sgprTotalIters] // Make sure there's work to do s_cbranch_scc1 label_NoBranch_S4FDBQ587JJL6NOU // Only branch on scc0 -s_getpc_b64 s[sgpr108:sgpr109] // addr of next instr -s_add_i32 s[sgpr110], label_KernelEnd, 4 // target branch offset -s_add_u32 s[sgpr108], s[sgpr108], s[sgpr110] // add target branch offset -s_addc_u32 s[sgpr109], s[sgpr109], 0 // add high and carry -s_setpc_b64 s[sgpr108:sgpr109] // branch to label_KernelEnd +s_getpc_b64 s[sgpr102:sgpr103] // addr of next instr +s_add_i32 s[sgpr104], label_KernelEnd, 4 // target branch offset +s_add_u32 s[sgpr102], s[sgpr102], s[sgpr104] // add target branch offset +s_addc_u32 s[sgpr103], s[sgpr103], 0 // add high and carry +s_setpc_b64 s[sgpr102:sgpr103] // branch to label_KernelEnd label_NoBranch_S4FDBQ587JJL6NOU: label_SK_UpdateDone: -s_mov_b32 s[sgprStreamKIter], s[sgpr105] // Store current iteration +s_mov_b32 s[sgprStreamKIter], s99 // Store current iteration /* Map StreamK tile index to wg0/1/2 */ -s_mul_hi_u32 s[sgpr106], s[sgpr104], s[sgprMagicNumProblemNumGroupTiles0By1] // s_magic mul, div alg 2 -s_lshr_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0By1], 31 // tmpS = extract abit -s_mul_i32 s[sgpr105], s[sgpr104], s[sgpr107] // s_magic mul, div alg 2 -s_add_u32 s[sgpr105], s[sgpr105], s[sgpr106] -s_and_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0By1], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s[sgpr105], s[sgpr105], s[sgpr107] // sMagicDiv Alg 2 -s_mov_b32 s[sgprWorkGroup2], s[sgpr105] // wg2 = Tile Idx / problemNumGroupTiles0By1 -s_mul_i32 s[sgpr105], s[sgpr105], s[sgprNumWorkGroups0] // remainder part 1 : quotient * divisor -s_mul_i32 s[sgpr105], s[sgpr105], s[sgprNumWorkGroups1] // remainder part 1 : quotient * divisor -s_sub_u32 s[sgpr104], s[sgpr104], s[sgpr105] // remainder -s_mul_hi_u32 s[sgpr106], s[sgpr104], s[sgprMagicNumberProblemNumGroupTiles0] // s_magic mul, div alg 2 -s_lshr_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0], 31 // tmpS = extract abit -s_mul_i32 s[sgpr105], s[sgpr104], s[sgpr107] // s_magic mul, div alg 2 -s_add_u32 s[sgpr105], s[sgpr105], s[sgpr106] -s_and_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s[sgpr105], s[sgpr105], s[sgpr107] // sMagicDiv Alg 2 -s_mov_b32 s[sgprWorkGroup1], s[sgpr105] // wg1 = Tile Idx / problemNumGroupTiles0 -s_mul_i32 s[sgprWorkGroup0], s[sgpr105], s[sgprNumWorkGroups0] // remainder part 1 : quotient * divisor -s_sub_u32 s[sgprWorkGroup0], s[sgpr104], s[sgprWorkGroup0] // wg0 = Tile Idx % problemNumGroupTiles0 +s_mul_i32 s99, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] // Total tiles +v_cvt_f32_u32 v4, s99 // TileID // nWG0*nWG1 +v_rcp_iflag_f32 v4, v4 // TileID // nWG0*nWG1 +v_cvt_f32_u32 v5, s98 // TileID // nWG0*nWG1 +v_mul_f32 v4, v4, v5 // TileID // nWG0*nWG1 +v_cvt_u32_f32 v4, v4 // TileID // nWG0*nWG1 +v_mul_u32_u24 v5, v4, s99 // TileID // nWG0*nWG1 +v_sub_u32 v5, s98, v5 // TileID // nWG0*nWG1 +v_cmpx_eq_u32 exec, v5, s99 // TileID // nWG0*nWG1 +v_add_u32 v4, 1, v4 // TileID // nWG0*nWG1 +v_mov_b32 v5, 0 // TileID // nWG0*nWG1 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s99 // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s99 // re-calculate remainder +v_sub_u32 v5, s98, v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup2], v4 // quotient +v_readfirstlane_b32 s100, v5 // remainder +v_cvt_f32_u32 v4, s[sgprNumWorkGroups0] // TileID // nWG0 +v_rcp_iflag_f32 v4, v4 // TileID // nWG0 +v_cvt_f32_u32 v5, s98 // TileID // nWG0 +v_mul_f32 v4, v4, v5 // TileID // nWG0 +v_cvt_u32_f32 v4, v4 // TileID // nWG0 +v_mul_u32_u24 v5, v4, s[sgprNumWorkGroups0] // TileID // nWG0 +v_sub_u32 v5, s98, v5 // TileID // nWG0 +v_cmpx_eq_u32 exec, v5, s[sgprNumWorkGroups0] // TileID // nWG0 +v_add_u32 v4, 1, v4 // TileID // nWG0 +v_mov_b32 v5, 0 // TileID // nWG0 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprNumWorkGroups0] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s[sgprNumWorkGroups0] // re-calculate remainder +v_sub_u32 v5, s98, v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup1], v4 // quotient +v_readfirstlane_b32 s[sgprWorkGroup0], v5 // remainder v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0 // s[Alpha] == 0.0f ? s_cbranch_vccz label_SKAlphaCheck // branch if s[Alpha] != 0 s_cmp_eq_u32 s[sgprStreamKLocalStart], 0 // does wg start tile? s_cbranch_scc1 label_NoBranch_UR8VN3A1SJCPC6PO // Only branch on scc0 -s_getpc_b64 s[sgpr108:sgpr109] // addr of next instr -s_add_i32 s[sgpr110], label_GW_End, 4 // target branch offset -s_add_u32 s[sgpr108], s[sgpr108], s[sgpr110] // add target branch offset -s_addc_u32 s[sgpr109], s[sgpr109], 0 // add high and carry -s_setpc_b64 s[sgpr108:sgpr109] // branch to label_GW_End +s_getpc_b64 s[sgpr102:sgpr103] // addr of next instr +s_add_i32 s[sgpr104], label_GW_End, 4 // target branch offset +s_add_u32 s[sgpr102], s[sgpr102], s[sgpr104] // add target branch offset +s_addc_u32 s[sgpr103], s[sgpr103], 0 // add high and carry +s_setpc_b64 s[sgpr102:sgpr103] // branch to label_GW_End label_NoBranch_UR8VN3A1SJCPC6PO: s_mov_b32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Skip iterations label_SKAlphaCheck: @@ -1008,130 +982,130 @@ s_cmp_gt_i32 s[sgprWGM], 1 // WGM > 1 ? s_cbranch_scc1 label_WGMPositive // branch if WGM > 1 s_cmp_ge_i32 s[sgprWGM], 0 // WGM >= 0 ? s_cbranch_scc1 label_WGM // branch if WGM >= 0 -s_abs_i32 s[sgpr108], s[sgprWGM] // abs(WGM) -v_cvt_f32_u32 v4, s[sgpr108] // WGM +s_abs_i32 s101, s[sgprWGM] // abs(WGM) +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprWorkGroup0] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprWorkGroup0], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr106], v4 // quotient -s_mul_i32 s[sgpr107], s[sgpr106], s[sgpr108] // quotient * non-magic divisor -s_sub_u32 s[sgpr107], s[sgprWorkGroup0], s[sgpr107] // WorkGroup0=remainder -s_mul_i32 s[sgpr107], s[sgpr107], s[sgprNumWorkGroups1] // (wg1 % WGM)*NumWorkGroups1 -s_add_u32 s[sgpr107], s[sgpr107], s[sgprWorkGroup1] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups1 -v_cvt_f32_u32 v4, s[sgpr108] // WGM +v_readfirstlane_b32 s97, v4 // quotient +s_mul_i32 s100, s97, s101 // quotient * non-magic divisor +s_sub_u32 s100, s[sgprWorkGroup0], s100 // WorkGroup0=remainder +s_mul_i32 s100, s100, s[sgprNumWorkGroups1] // (wg1 % WGM)*NumWorkGroups1 +s_add_u32 s100, s100, s[sgprWorkGroup1] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups1 +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprNumWorkGroups0] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprNumWorkGroups0], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr104], v4 // quotient -s_mul_i32 s[sgpr105], s[sgpr108], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgpr105], s[sgprNumWorkGroups0], s[sgpr105] // NumWorkGroups0=remainder -s_cmp_eq_u32 s[sgpr105], 0 // remainder == 0 ? -s_cmov_b32 s[sgpr105], s[sgpr108] // remainder = WGM if remainder == 0 -s_cmp_ge_u32 s[sgpr106], s[sgpr104] // blockId >= numFullBlocks ? -s_cselect_b32 s[sgpr104], s[sgpr105], s[sgpr108] -v_cvt_f32_u32 v4, s[sgpr104] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_cvt_f32_u32 v5, s[sgpr107] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_mul_f32 v4, v4, v5 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_mul_u32_u24 v5, v4, s[sgpr104] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_sub_u32 v5, s[sgpr107], v5 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_cmpx_eq_u32 exec, v5, s[sgpr104] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_add_u32 v4, 1, v4 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_mov_b32 v5, 0 // s[sgprWorkGroup0] = s[sgpr107] % s[sgpr104] +v_readfirstlane_b32 s98, v4 // quotient +s_mul_i32 s99, s101, s98 // quotient * non-magic divisor +s_sub_u32 s99, s[sgprNumWorkGroups0], s99 // NumWorkGroups0=remainder +s_cmp_eq_u32 s99, 0 // remainder == 0 ? +s_cmov_b32 s99, s101 // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s97, s98 // blockId >= numFullBlocks ? +s_cselect_b32 s98, s99, s101 +v_cvt_f32_u32 v4, s98 // s[sgprWorkGroup1] = s100 / s98 +v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup1] = s100 / s98 +v_cvt_f32_u32 v5, s100 // s[sgprWorkGroup1] = s100 / s98 +v_mul_f32 v4, v4, v5 // s[sgprWorkGroup1] = s100 / s98 +v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup1] = s100 / s98 +v_mul_u32_u24 v5, v4, s98 // s[sgprWorkGroup1] = s100 / s98 +v_sub_u32 v5, s100, v5 // s[sgprWorkGroup1] = s100 / s98 +v_cmpx_eq_u32 exec, v5, s98 // s[sgprWorkGroup1] = s100 / s98 +v_add_u32 v4, 1, v4 // s[sgprWorkGroup1] = s100 / s98 +v_mov_b32 v5, 0 // s[sgprWorkGroup0] = s100 % s98 s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr104] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s98 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 -v_mul_u32_u24 v5, v4, s[sgpr104] // re-calculate remainder -v_sub_u32 v5, s[sgpr107], v5 // re-calculate remainder +v_mul_u32_u24 v5, v4, s98 // re-calculate remainder +v_sub_u32 v5, s100, v5 // re-calculate remainder s_mov_b64 exec, -1 // Reset exec v_readfirstlane_b32 s[sgprWorkGroup1], v4 // quotient v_readfirstlane_b32 s[sgprWorkGroup0], v5 // remainder -s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgprWorkGroup0], s[sgpr107], s[sgprWorkGroup0] // WorkGroup0=remainder -s_mul_i32 s[sgpr106], s[sgpr106], s[sgpr108] // blockId * WGM -s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s[sgpr106] // wg1 += blockId * WGM +s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s98 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup0], s100, s[sgprWorkGroup0] // WorkGroup0=remainder +s_mul_i32 s97, s97, s101 // blockId * WGM +s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s97 // wg1 += blockId * WGM s_branch label_WGM label_WGMPositive: -s_mov_b32 s[sgpr108], s[sgprWGM] // WGM -v_cvt_f32_u32 v4, s[sgpr108] // WGM +s_mov_b32 s101, s[sgprWGM] // WGM +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprWorkGroup1] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprWorkGroup1], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr106], v4 // quotient -s_mul_i32 s[sgpr107], s[sgpr106], s[sgpr108] // quotient * non-magic divisor -s_sub_u32 s[sgpr107], s[sgprWorkGroup1], s[sgpr107] // WorkGroup1=remainder -s_mul_i32 s[sgpr107], s[sgpr107], s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 -s_add_u32 s[sgpr107], s[sgpr107], s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 -v_cvt_f32_u32 v4, s[sgpr108] // WGM +v_readfirstlane_b32 s97, v4 // quotient +s_mul_i32 s100, s97, s101 // quotient * non-magic divisor +s_sub_u32 s100, s[sgprWorkGroup1], s100 // WorkGroup1=remainder +s_mul_i32 s100, s100, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s100, s100, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprNumWorkGroups1] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprNumWorkGroups1], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr104], v4 // quotient -s_mul_i32 s[sgpr105], s[sgpr108], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgpr105], s[sgprNumWorkGroups1], s[sgpr105] // NumWorkGroups1=remainder -s_cmp_eq_u32 s[sgpr105], 0 // remainder == 0 ? -s_cmov_b32 s[sgpr105], s[sgpr108] // remainder = WGM if remainder == 0 -s_cmp_ge_u32 s[sgpr106], s[sgpr104] // blockId >= numFullBlocks ? -s_cselect_b32 s[sgpr104], s[sgpr105], s[sgpr108] -v_cvt_f32_u32 v4, s[sgpr104] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_cvt_f32_u32 v5, s[sgpr107] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_mul_f32 v4, v4, v5 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_mul_u32_u24 v5, v4, s[sgpr104] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_sub_u32 v5, s[sgpr107], v5 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_cmpx_eq_u32 exec, v5, s[sgpr104] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_add_u32 v4, 1, v4 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_mov_b32 v5, 0 // s[sgprWorkGroup1] = s[sgpr107] % s[sgpr104] +v_readfirstlane_b32 s98, v4 // quotient +s_mul_i32 s99, s101, s98 // quotient * non-magic divisor +s_sub_u32 s99, s[sgprNumWorkGroups1], s99 // NumWorkGroups1=remainder +s_cmp_eq_u32 s99, 0 // remainder == 0 ? +s_cmov_b32 s99, s101 // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s97, s98 // blockId >= numFullBlocks ? +s_cselect_b32 s98, s99, s101 +v_cvt_f32_u32 v4, s98 // s[sgprWorkGroup0] = s100 / s98 +v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup0] = s100 / s98 +v_cvt_f32_u32 v5, s100 // s[sgprWorkGroup0] = s100 / s98 +v_mul_f32 v4, v4, v5 // s[sgprWorkGroup0] = s100 / s98 +v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup0] = s100 / s98 +v_mul_u32_u24 v5, v4, s98 // s[sgprWorkGroup0] = s100 / s98 +v_sub_u32 v5, s100, v5 // s[sgprWorkGroup0] = s100 / s98 +v_cmpx_eq_u32 exec, v5, s98 // s[sgprWorkGroup0] = s100 / s98 +v_add_u32 v4, 1, v4 // s[sgprWorkGroup0] = s100 / s98 +v_mov_b32 v5, 0 // s[sgprWorkGroup1] = s100 % s98 s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr104] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s98 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 -v_mul_u32_u24 v5, v4, s[sgpr104] // re-calculate remainder -v_sub_u32 v5, s[sgpr107], v5 // re-calculate remainder +v_mul_u32_u24 v5, v4, s98 // re-calculate remainder +v_sub_u32 v5, s100, v5 // re-calculate remainder s_mov_b64 exec, -1 // Reset exec v_readfirstlane_b32 s[sgprWorkGroup0], v4 // quotient v_readfirstlane_b32 s[sgprWorkGroup1], v5 // remainder -s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgprWorkGroup1], s[sgpr107], s[sgprWorkGroup1] // WorkGroup1=remainder -s_mul_i32 s[sgpr106], s[sgpr106], s[sgpr108] // blockId * WGM -s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s[sgpr106] // wg1 += blockId * WGM +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s98 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s100, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s97, s97, s101 // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s97 // wg1 += blockId * WGM label_WGM: /******************************************/ @@ -1165,8 +1139,8 @@ v_lshl_add_u32 v5, v7, 13, v5 // 7. wave offset in M dimen: /* local read addresses: final offsets a */ v_lshrrev_b32 v6, 6, v[vgprSerial] // 6 = Serial / 64 v_lshrrev_b32 v6, 2, v6 // LSU offset: Get LSU wave_id -s_mov_b32 s[sgpr104], 64 // LSU offset: stride = lsuStride(64) when umlds==True -v_mul_lo_u32 v6, s[sgpr104], v6 // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD) +s_mov_b32 s97, 64 // LSU offset: stride = lsuStride(64) when umlds==True +v_mul_lo_u32 v6, s97, v6 // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD) v_add_lshl_u32 v[vgprLocalReadAddrA], v6, v4, 0x1 // Final Offset: offset = (lro0+lsuoffset)*bpeDS v_lshrrev_b32 v7, 10, v[vgprLocalReadAddrA] // Final Offset: padding 16 per block 1024 v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 4, v[vgprLocalReadAddrA] // Final Offset: padding 16 per block 1024 @@ -1175,7 +1149,7 @@ v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 4, v[vgprLocalReadAddrA] // Final Offs v_lshrrev_b32 v4, 6, v[vgprSerial] // 4 = Serial / 64 v_lshrrev_b32 v4, 2, v4 // LSU offset: Get LSU wave_id // LSU offset: stride = lsuStride(64) when umlds==True (dup assign opt.) -v_mul_lo_u32 v4, s[sgpr104], v4 // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD) +v_mul_lo_u32 v4, s97, v4 // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD) v_add_lshl_u32 v[vgprLocalReadAddrB], v4, v5, 0x1 // Final Offset: offset = (lro1+lsuoffset)*bpeDS v_lshrrev_b32 v6, 10, v[vgprLocalReadAddrB] // Final Offset: padding 16 per block 1024 v_lshl_add_u32 v[vgprLocalReadAddrB], v6, 4, v[vgprLocalReadAddrB] // Final Offset: padding 16 per block 1024 @@ -1290,108 +1264,80 @@ s_lshl_b32 s[sgprScalarGlobalReadOffsetB+5], s[sgprScalarGlobalReadOffsetB+5], 0 s_mul_i32 s[sgprScalarGlobalReadOffsetB+6], s[sgprStrideB1J], 224 // compute offset diff (scaled tileDim) s_lshl_b32 s[sgprScalarGlobalReadOffsetB+6], s[sgprScalarGlobalReadOffsetB+6], 0x1 // scalar offset *= bytes/element -// Use sgprScalarGlobalReadOffsetA sgprs -.set sgpr104, sgprSKItersPerWG // skitersperwg, overwrite, 54 -.set sgpr105, sgprskGrid // skgrid, overwrite, 55 -.set sgpr106, sgprMagicNumberProblemNumGroupTiles0 // sgprMagicNumberProblemNumGroupTiles0, 46 -.set sgpr107, sgprMagicShiftProblemNumGroupTiles0 // sgprMagicShiftProblemNumGroupTiles0, 47 -.set sgpr108, sgprMagicShiftItersPerTile // sgprMagicShiftItersPerTile, 50 -.set sgpr109, sgprMagicNumProblemNumGroupTiles0By1 // sgprMagicNumProblemNumGroupTiles0By1, 51 -.set sgpr110, sgprWGM // wgm, 7 - -// Save sgpr values to vgpr -v_writelane_b32 v255, s[sgprSKItersPerWG], 0 -s_nop 0 -v_writelane_b32 v255, s[sgprskGrid], 1 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicNumberProblemNumGroupTiles0], 2 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicShiftProblemNumGroupTiles0], 3 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicShiftItersPerTile], 4 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicNumProblemNumGroupTiles0By1], 5 -s_nop 0 -v_writelane_b32 v255, s[sgprWGM], 6 -s_nop 0 -v_writelane_b32 v255, s[sgprKernArgAddress], 7 -s_nop 0 -v_writelane_b32 v255, s[sgprKernArgAddress+1], 8 - /* global read addresses: addresses a */ /* max read offset = size[n] * stride[n-1] */ -s_mul_hi_u32 s[sgpr107], s[sgprWorkGroup0], 256 // WorkGroup[01] * MT -s_mul_i32 s[sgpr106], s[sgprWorkGroup0], 256 // WorkGroup[01] * MT -s_mul_hi_u32 s[sgpr107], s[sgpr106], s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr106], s[sgpr106], s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr104], s[sgprStreamKLocalStart], DepthU // StreamK tile start offset -s_mul_hi_u32 s[sgpr105], s[sgpr104], constStrideAL // StreamK tile start offset -s_mul_i32 s[sgpr104], s[sgpr104], constStrideAL // StreamK tile start offset -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum GsuOffset term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum GsuOffset term to tilestart +s_mul_hi_u32 s101, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_i32 s100, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s101, s100, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_mul_i32 s100, s100, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_mul_i32 s98, s[sgprStreamKLocalStart], DepthU // StreamK tile start offset +s_mul_hi_u32 s99, s98, constStrideAL // StreamK tile start offset +s_mul_i32 s98, s98, constStrideAL // StreamK tile start offset +s_add_u32 s100, s100, s98 // accum GsuOffset term to tilestart +s_addc_u32 s101, s101, s99 // accum GsuOffset term to tilestart s_mov_b64 s[sgprShadowLimitA+0:sgprShadowLimitA+0+1], 1 // Init tensor size -s_sub_u32 s[sgpr104], s[sgprSizeL], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], constStrideAL, s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], constStrideAL, s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgpr104], s[sgprSizeI], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], s[sgprStrideA0I], s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], s[sgprStrideA0I], s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr106] // sub tileStart -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr107] // sub tileStart +s_sub_u32 s98, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s99, constStrideAL, s98 // stride x (size-1) +s_mul_i32 s98, constStrideAL, s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // sum tensor size +s_sub_u32 s98, s[sgprSizeI], 1 // (size-1) +s_mul_hi_u32 s99, s[sgprStrideA0I], s98 // stride x (size-1) +s_mul_i32 s98, s[sgprStrideA0I], s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // sum tensor size +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s100 // sub tileStart +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s101 // sub tileStart s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], 0x1 // Set limit to use bytes s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 16 // extend limit for pre-pad s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 -s_mul_hi_u32 s[sgpr105], s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG -s_mul_i32 s[sgpr104], s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum wg term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum wg term to tilestart -s_lshl_b64 s[sgpr106:sgpr107], s[sgpr106:sgpr107], 1 // tileStart *= BPE -s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgpr106] // SRD base = Address+ tileStart0 -s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgpr107] // SRD base = Address+ tileStart1 +s_mul_hi_u32 s99, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s98, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s100, s100, s98 // accum wg term to tilestart +s_addc_u32 s101, s101, s99 // accum wg term to tilestart +s_lshl_b64 s[100:101], s[100:101], 1 // tileStart *= BPE +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s100 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s101 // SRD base = Address+ tileStart1 s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD /* global read addresses: addresses b */ /* max read offset = size[n] * stride[n-1] */ -s_mul_hi_u32 s[sgpr107], s[sgprWorkGroup1], 256 // WorkGroup[01] * MT -s_mul_i32 s[sgpr106], s[sgprWorkGroup1], 256 // WorkGroup[01] * MT -s_mul_hi_u32 s[sgpr107], s[sgpr106], s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr106], s[sgpr106], s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr104], s[sgprStreamKLocalStart], DepthU // StreamK tile start offset -s_mul_hi_u32 s[sgpr105], s[sgpr104], constStrideBL // StreamK tile start offset -s_mul_i32 s[sgpr104], s[sgpr104], constStrideBL // StreamK tile start offset -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum GsuOffset term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum GsuOffset term to tilestart +s_mul_hi_u32 s101, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_i32 s100, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s101, s100, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_mul_i32 s100, s100, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_mul_i32 s98, s[sgprStreamKLocalStart], DepthU // StreamK tile start offset +s_mul_hi_u32 s99, s98, constStrideBL // StreamK tile start offset +s_mul_i32 s98, s98, constStrideBL // StreamK tile start offset +s_add_u32 s100, s100, s98 // accum GsuOffset term to tilestart +s_addc_u32 s101, s101, s99 // accum GsuOffset term to tilestart s_mov_b64 s[sgprShadowLimitB+0:sgprShadowLimitB+0+1], 1 // Init tensor size -s_sub_u32 s[sgpr104], s[sgprSizeL], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], constStrideBL, s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], constStrideBL, s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgpr104], s[sgprSizeJ], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], s[sgprStrideB1J], s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], s[sgprStrideB1J], s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr106] // sub tileStart -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr107] // sub tileStart +s_sub_u32 s98, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s99, constStrideBL, s98 // stride x (size-1) +s_mul_i32 s98, constStrideBL, s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // sum tensor size +s_sub_u32 s98, s[sgprSizeJ], 1 // (size-1) +s_mul_hi_u32 s99, s[sgprStrideB1J], s98 // stride x (size-1) +s_mul_i32 s98, s[sgprStrideB1J], s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // sum tensor size +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s100 // sub tileStart +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s101 // sub tileStart s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], 0x1 // Set limit to use bytes s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], 16 // extend limit for pre-pad s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], 0 // extend limit for pre-pad s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 -s_mul_hi_u32 s[sgpr105], s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG -s_mul_i32 s[sgpr104], s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum wg term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum wg term to tilestart -s_lshl_b64 s[sgpr106:sgpr107], s[sgpr106:sgpr107], 1 // tileStart *= BPE -s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgpr106] // SRD base = Address+ tileStart0 -s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgpr107] // SRD base = Address+ tileStart1 +s_mul_hi_u32 s99, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s98, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s100, s100, s98 // accum wg term to tilestart +s_addc_u32 s101, s101, s99 // accum wg term to tilestart +s_lshl_b64 s[100:101], s[100:101], 1 // tileStart *= BPE +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s100 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s101 // SRD base = Address+ tileStart1 s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD /* global read addresses: increments a */ @@ -1405,87 +1351,87 @@ v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0 // s[Alpha] == 0.0f ? s_cbranch_vccz label_SKAlphaCheck2 // branch if s[Alpha] != 0 s_mov_b32 s[sgprLoopCounterL], 0 // Skip iterations label_SKAlphaCheck2: -s_and_b32 s[sgpr105], 63, s[sgprSizesSum+0] // s[sgpr105] = s[sgprSizesSum+0] % 64 -s_cmp_eq_u32 s[sgpr105], 0 // numIterL == 0 -s_cselect_b32 s[sgpr104], 0, 1 // check if size uses tail loop +s_and_b32 s99, 63, s[sgprSizesSum+0] // s99 = s[sgprSizesSum+0] % 64 +s_cmp_eq_u32 s99, 0 // numIterL == 0 +s_cselect_b32 s98, 0, 1 // check if size uses tail loop s_cmp_eq_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Check if WG processes final iteration of tile -s_cselect_b32 s[sgpr104], s[sgpr104], 0 // this WG runs tail loop -s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], s[sgpr104] // Adjust loop counter for tail loop +s_cselect_b32 s98, s98, 0 // this WG runs tail loop +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], s98 // Adjust loop counter for tail loop s_mov_b32 s[sgprOrigLoopCounter], s[sgprLoopCounterL] // copy loop counter -s_and_b32 s[sgpr106], s[sgprStaggerU], 0x1f00 -s_lshr_b32 s[sgpr106], s[sgpr106], 0x8 -s_and_b32 s[sgpr107], s[sgprStaggerU], 0xe000 +s_and_b32 s100, s[sgprStaggerU], 0x1f00 +s_lshr_b32 s100, s100, 0x8 +s_and_b32 s101, s[sgprStaggerU], 0xe000 s_and_b32 s[sgprStaggerU], s[sgprStaggerU], 0xff -s_mov_b32 s[sgpr104], s[sgprStaggerU] // init staggerU +s_mov_b32 s98, s[sgprStaggerU] // init staggerU label_beginStaggerUIter: -s_lshl_b32 s[sgpr105], s[sgpr104], s[sgpr106] // shift by StaggerUStride -s_cmp_ge_u32 s[sgprOrigLoopCounter], s[sgpr105] // loopCount >= current shift Count +s_lshl_b32 s99, s98, s100 // shift by StaggerUStride +s_cmp_ge_u32 s[sgprOrigLoopCounter], s99 // loopCount >= current shift Count s_cbranch_scc1 label_endStaggerUIter // jump to end -s_lshr_b32 s[sgpr104], s[sgpr104], 1 // step down to smaller stagger +s_lshr_b32 s98, s98, 1 // step down to smaller stagger s_branch label_beginStaggerUIter // jump to begin label_endStaggerUIter: -s_sub_u32 s[sgpr105], s[sgpr104], 1 // staggerU mask -s_cmp_ge_u32 s[sgpr104], 1 // if current staggerU >= 1 -s_cselect_b32 s[sgprStaggerUIter], s[sgpr105], 0 // set Mask -s_cmp_eq_u32 s[sgpr107], 0x0 +s_sub_u32 s99, s98, 1 // staggerU mask +s_cmp_ge_u32 s98, 1 // if current staggerU >= 1 +s_cselect_b32 s[sgprStaggerUIter], s99, 0 // set Mask +s_cmp_eq_u32 s101, 0x0 s_cbranch_scc1 label_StaggerUMapping_1 -s_mov_b32 s[sgpr104], s[sgprWorkGroup0] +s_mov_b32 s98, s[sgprWorkGroup0] s_branch label_staggerInputEnd label_StaggerUMapping_1: -s_cmp_eq_u32 s[sgpr107], 0x2000 +s_cmp_eq_u32 s101, 0x2000 s_cbranch_scc1 label_StaggerUMapping_2 -s_mov_b32 s[sgpr104], s[sgprWorkGroup1] +s_mov_b32 s98, s[sgprWorkGroup1] s_branch label_staggerInputEnd label_StaggerUMapping_2: -s_cmp_eq_u32 s[sgpr107], 0x4000 +s_cmp_eq_u32 s101, 0x4000 s_cbranch_scc1 label_StaggerUMapping_3 -s_mov_b32 s[sgpr104], -0x1 +s_mov_b32 s98, -0x1 s_branch label_staggerInputEnd label_StaggerUMapping_3: -s_cmp_eq_u32 s[sgpr107], 0x6000 +s_cmp_eq_u32 s101, 0x6000 s_cbranch_scc1 label_StaggerUMapping_4 -s_mul_i32 s[sgpr105], s[sgprNumWorkGroups0], s[sgprWorkGroup1] -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr105] -s_add_u32 s[sgpr104], s[sgpr104], s[sgprWorkGroup0] +s_mul_i32 s99, s[sgprNumWorkGroups0], s[sgprWorkGroup1] +s_add_u32 s98, s98, s99 +s_add_u32 s98, s98, s[sgprWorkGroup0] s_branch label_staggerInputEnd label_StaggerUMapping_4: -s_cmp_eq_u32 s[sgpr107], 0x8000 +s_cmp_eq_u32 s101, 0x8000 s_cbranch_scc1 label_staggerInputEnd -s_mov_b32 s[sgpr104], -0x1 +s_mov_b32 s98, -0x1 s_branch label_staggerInputEnd label_staggerInputEnd: -s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s[sgpr104] // Compute actual stagger start for this tile -s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s[sgpr106] // shift by StaggerUStride +s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s98 // Compute actual stagger start for this tile +s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s100 // shift by StaggerUStride s_cmp_gt_u32 s[sgprStreamKLocalStart], 0 // does wg start tile? s_cmov_b32 s[sgprStaggerUIter], 0 // set stagger=0 for partial tiles s_cmp_lt_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // does wg finish tile? s_cmov_b32 s[sgprStaggerUIter], 0 // set stagger=0 for partial tiles /* SRDs += (StaggerUIter) * GlobalReadIncsA+0 */ -s_mul_hi_i32 s[sgpr105], s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset -s_mul_i32 s[sgpr104], s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset +s_mul_hi_i32 s99, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset +s_mul_i32 s98, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset s_mul_hi_i32 s[sgprWrapUA+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop s_mul_i32 s[sgprWrapUA+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop s_sub_u32 s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0], s[sgprWrapUA+0] // remove one iteration s_subb_u32 s[sgprWrapUA+1], 0, s[sgprWrapUA+1] // remove one iteration -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 /* SRDs += (StaggerUIter) * GlobalReadIncsB+0 */ -s_mul_hi_i32 s[sgpr105], s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset -s_mul_i32 s[sgpr104], s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset +s_mul_hi_i32 s99, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset +s_mul_i32 s98, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset s_mul_hi_i32 s[sgprWrapUB+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop s_mul_i32 s[sgprWrapUB+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop s_sub_u32 s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0], s[sgprWrapUB+0] // remove one iteration s_subb_u32 s[sgprWrapUB+1], 0, s[sgprWrapUB+1] // remove one iteration -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 s_add_u32 s[sgprStaggerUIter], s[sgprStaggerUIter], 2 // Subtract (PGR-1); StaggerUIter now contains target iteration to wrap @@ -1535,26 +1481,26 @@ s_add_u32 m0, m0, 4160 // Move LDS write address to buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0 /* global read inc A loopL */ -s_add_u32 s[sgpr106], s[sgprLoopCounterL], 1 // remove pf(1) -s_cmp_eq_u32 s[sgprStaggerUIter], s[sgpr106] // Is this wrapIter? (pf) -s_cselect_b32 s[sgpr104], s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUA+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_add_u32 s100, s[sgprLoopCounterL], 1 // remove pf(1) +s_cmp_eq_u32 s[sgprStaggerUIter], s100 // Is this wrapIter? (pf) +s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUA+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 /* global read inc B loopL */ -s_add_u32 s[sgpr106], s[sgprLoopCounterL], 1 // remove pf(1) -s_cmp_eq_u32 s[sgprStaggerUIter], s[sgpr106] // Is this wrapIter? (pf) -s_cselect_b32 s[sgpr104], s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUB+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_add_u32 s100, s[sgprLoopCounterL], 1 // remove pf(1) +s_cmp_eq_u32 s[sgprStaggerUIter], s100 // Is this wrapIter? (pf) +s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUB+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 @@ -1571,28 +1517,28 @@ s_mov_b32 s[sgprSrdC+2], BufferOOB s_mov_b32 s[sgprSrdC+3], Srd127_96 // Set bits 127_96 in post-loop SRD -s_mul_i32 s[sgpr106], MT1, s[sgprWorkGroup1] // <- wg1*MT1 -s_mul_hi_u32 s[sgpr105], s[sgpr106], s[sgprStrideC1J] // ScaleC s[sgpr106] by Stride -s_mul_i32 s[sgpr104], s[sgpr106], s[sgprStrideC1J] // ScaleC s[sgpr106] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s[sgpr105] // add hi to SRD -s_mul_hi_u32 s[sgpr105], s[sgpr106], s[sgprStrideD1J] // ScaleD s[sgpr106] by Stride -s_mul_i32 s[sgpr104], s[sgpr106], s[sgprStrideD1J] // ScaleD s[sgpr106] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s[sgpr105] // add hi to SRD - -s_mul_hi_u32 s[sgpr105], s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride -s_mul_i32 s[sgpr104], s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s[sgpr105] // add hi to SRD -s_mul_hi_u32 s[sgpr105], s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride -s_mul_i32 s[sgpr104], s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgpr105] // add hi to SRD +s_mul_i32 s100, MT1, s[sgprWorkGroup1] // <- wg1*MT1 +s_mul_hi_u32 s99, s100, s[sgprStrideC1J] // ScaleC s100 by Stride +s_mul_i32 s98, s100, s[sgprStrideC1J] // ScaleC s100 by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s99 // add hi to SRD +s_mul_hi_u32 s99, s100, s[sgprStrideD1J] // ScaleD s100 by Stride +s_mul_i32 s98, s100, s[sgprStrideD1J] // ScaleD s100 by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s99 // add hi to SRD + +s_mul_hi_u32 s99, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride +s_mul_i32 s98, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s99 // add hi to SRD +s_mul_hi_u32 s99, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride +s_mul_i32 s98, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s99 // add hi to SRD // Init C @@ -1635,11 +1581,11 @@ s_cmp_eq_u32 s[sgprLoopCounterL], 0 // at last iteration? /* after InitC, skip to end of prefetch last iter if numIter==0 */ s_cbranch_scc0 label_NoBranch_8S4L1KCK9VFC7AQU // Only branch on scc1 -s_getpc_b64 s[sgpr104:sgpr105] // addr of next instr -s_add_i32 s[sgpr106], label_PrefetchGlobalLastIterEnd, 4 // target branch offset -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr106] // add target branch offset -s_addc_u32 s[sgpr105], s[sgpr105], 0 // add high and carry -s_setpc_b64 s[sgpr104:sgpr105] // branch to label_PrefetchGlobalLastIterEnd +s_getpc_b64 s[98:99] // addr of next instr +s_add_i32 s100, label_PrefetchGlobalLastIterEnd, 4 // target branch offset +s_add_u32 s98, s98, s100 // add target branch offset +s_addc_u32 s99, s99, 0 // add high and carry +s_setpc_b64 s[98:99] // branch to label_PrefetchGlobalLastIterEnd label_NoBranch_8S4L1KCK9VFC7AQU: s_waitcnt vmcnt(0) // wait for global read s_barrier // For stream-k / persistent loop @@ -1655,7 +1601,6 @@ s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap R s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR s_cmp_eq_u32 s[sgprLoopCounterL], 0x1 // PGR=2 but only 1 loop s_cbranch_scc1 label_skipPGR2 // PGR=2 but only 1 loop - s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0 s_add_u32 m0, m0, 4160 // Move LDS write address to next line @@ -1721,6 +1666,14 @@ ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] o ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 s_waitcnt lgkmcnt(0) + +/* local read inc a */ +/* N/A, lro->32 */ +/* self.localReadDoCntA 1 self.localReadDoCntB 1 */ + +/* local read inc b */ +/* N/A, lro->32 */ +/* self.localReadDoCntA 1 self.localReadDoCntB 1 */ /******************************************/ /* Unrolled Loop(s) - Begin */ @@ -1730,7 +1683,7 @@ s_cmp_eq_u32 s[sgprLoopCounterL], 0x1 // LoopCounterL < EndCounter s_cbranch_scc1 label_toPGR1 // PGR=2 but only 1 loop, toPGR1 s_cmp_le_u32 s[sgprLoopCounterL], 0x2 // LoopCounterL < EndCounter s_cbranch_scc1 label_LoopEndL // do not enter LoopL - +label_LoopBeginL: // MAIN LOOP MACRO - Shared code between Even/Odd simds .macro MAINLOOP isOdd @@ -1742,27 +1695,27 @@ ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] off v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] /* global read inc A loopL */ s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s[sgpr104], s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? /* mfmaIndex:2 */ v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 /* mfmaIndex:3 */ v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] -s_cselect_b32 s[sgpr105], s[sgprWrapUA+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) +s_cselect_b32 s99, s[sgprWrapUA+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) /* mfmaIndex:4 */ v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 /* mfmaIndex:5 */ v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) /* mfmaIndex:6 */ v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 /* mfmaIndex:7 */ v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? /* mfmaIndex:8 */ v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] @@ -1777,25 +1730,25 @@ v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0 ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 /* mfmaIndex:11 */ v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] -s_cselect_b32 s[sgpr104], s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUB+1], 0 // incUpper <- ? +s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUB+1], 0 // incUpper <- ? /* mfmaIndex:12 */ v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 /* mfmaIndex:13 */ v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) /* mfmaIndex:14 */ v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 /* mfmaIndex:15 */ v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) /* mfmaIndex:16 */ v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? /* mfmaIndex:17 */ v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] @@ -2432,8 +2385,8 @@ v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+ // EVEN SIMDID takes WVLoop0 path, ODD SIMDID takes other path -s_getreg_b32 s[sgpr104], hwreg(HW_REG_HW_ID, 4, 1) -s_cmp_eq_u32 s[sgpr104], 0 +s_getreg_b32 s98, hwreg(HW_REG_HW_ID, 4, 1) +s_cmp_eq_u32 s98, 0 s_cbranch_scc0 label_LoopBeginL1 /******************************************/ @@ -2455,8 +2408,6 @@ s_cbranch_scc0 label_LoopBeginL1 // restart LoopL label_LoopEndL: -/* Before NLL: Check VGPR.checkin for INT8 LW */ - /******************************************/ /* Ord. NoGlobalLoadLoop - Begin */ /******************************************/ @@ -2473,7 +2424,7 @@ ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] off v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] /* global read inc A loopL */ s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s[sgpr104], s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? /* mfmaIndex:2 */ v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] @@ -2481,8 +2432,8 @@ ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] off /* mfmaIndex:3 */ v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] -s_cselect_b32 s[sgpr105], s[sgprWrapUA+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) +s_cselect_b32 s99, s[sgprWrapUA+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) /* mfmaIndex:4 */ v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] @@ -2490,8 +2441,8 @@ ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] off /* mfmaIndex:5 */ v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) /* mfmaIndex:6 */ v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] @@ -2499,7 +2450,7 @@ ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] off /* mfmaIndex:7 */ v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? /* mfmaIndex:8 */ @@ -2518,8 +2469,8 @@ ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] o /* mfmaIndex:11 */ v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] -s_cselect_b32 s[sgpr104], s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUB+1], 0 // incUpper <- ? +s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUB+1], 0 // incUpper <- ? /* mfmaIndex:12 */ v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] @@ -2527,8 +2478,8 @@ ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] o /* mfmaIndex:13 */ v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) /* mfmaIndex:14 */ v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] @@ -2536,8 +2487,8 @@ ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] o /* mfmaIndex:15 */ v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) /* mfmaIndex:16 */ /* localReadsVacancy: latencyLeft 1 */ @@ -3259,12 +3210,12 @@ label_PrefetchGlobalLastIterEnd: /******************************************/ /* local write reset offsets a */ -s_xor_b32 s[sgpr104], s[sgprSwapA], s[sgprLocalWriteAddrA] // Get other lds buffer offset value -s_min_u32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgpr104] // Set LWA to first buffer offset +s_xor_b32 s97, s[sgprSwapA], s[sgprLocalWriteAddrA] // Get other lds buffer offset value +s_min_u32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s97 // Set LWA to first buffer offset /* local write reset offsets b */ -s_xor_b32 s[sgpr104], s[sgprSwapB], s[sgprLocalWriteAddrB] // Get other lds buffer offset value -s_min_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgpr104] // Set LWA to first buffer offset +s_xor_b32 s97, s[sgprSwapB], s[sgprLocalWriteAddrB] // Get other lds buffer offset value +s_min_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s97 // Set LWA to first buffer offset /* Check out VGPR (numG2LA,numG2LB,numG2LMetadata) = (32,32,0) */ .set vgprG2LA_BASE, 4 .set vgprG2LA, vgprG2LA_BASE+0 @@ -3283,56 +3234,56 @@ s_mov_b32 s[sgprOrigLoopCounter], 0 // repurpose to count each lo s_cbranch_scc1 label_SkipTailLoopL // skip to end of tail loop b/c numIter==0 /* remove stagger offsets for tail loop */ -s_sub_i32 s[sgpr104], 3, s[sgprStaggerUIter] -s_cmp_ge_i32 s[sgpr104], 0 +s_sub_i32 s98, 3, s[sgprStaggerUIter] +s_cmp_ge_i32 s98, 0 s_cbranch_scc0 label_Negative_J5DQFVGFWLXU2DUR -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes s_branch label_MultiplyDone_DLSAQLEVYLOBCPNL label_Negative_J5DQFVGFWLXU2DUR: -s_abs_i32 s[sgpr104], s[sgpr104] -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_xor_b32 s[sgpr104], s[sgpr104], 0xffffffff -s_xor_b32 s[sgpr105], s[sgpr105], 0xffffffff -s_add_u32 s[sgpr104], s[sgpr104], 0x1 -s_addc_u32 s[sgpr105], s[sgpr105], 0 +s_abs_i32 s98, s98 +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_xor_b32 s98, s98, 0xffffffff +s_xor_b32 s99, s99, 0xffffffff +s_add_u32 s98, s98, 0x1 +s_addc_u32 s99, s99, 0 label_MultiplyDone_DLSAQLEVYLOBCPNL: -s_sub_u32 s[sgpr104], s[sgpr104], s[sgprWrapUA] // S - WrapU -s_subb_u32 s[sgpr105], s[sgpr105], s[sgprWrapUA+1] // S - WrapU -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_sub_u32 s98, s98, s[sgprWrapUA] // S - WrapU +s_subb_u32 s99, s99, s[sgprWrapUA+1] // S - WrapU +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 -s_sub_i32 s[sgpr104], 3, s[sgprStaggerUIter] -s_cmp_ge_i32 s[sgpr104], 0 +s_sub_i32 s98, 3, s[sgprStaggerUIter] +s_cmp_ge_i32 s98, 0 s_cbranch_scc0 label_Negative_LQI6BOBE0EY8XIP1 -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes s_branch label_MultiplyDone_9N1QELR2XL4Z0HRB label_Negative_LQI6BOBE0EY8XIP1: -s_abs_i32 s[sgpr104], s[sgpr104] -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_xor_b32 s[sgpr104], s[sgpr104], 0xffffffff -s_xor_b32 s[sgpr105], s[sgpr105], 0xffffffff -s_add_u32 s[sgpr104], s[sgpr104], 0x1 -s_addc_u32 s[sgpr105], s[sgpr105], 0 +s_abs_i32 s98, s98 +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_xor_b32 s98, s98, 0xffffffff +s_xor_b32 s99, s99, 0xffffffff +s_add_u32 s98, s98, 0x1 +s_addc_u32 s99, s99, 0 label_MultiplyDone_9N1QELR2XL4Z0HRB: -s_sub_u32 s[sgpr104], s[sgpr104], s[sgprWrapUB] // S - WrapU -s_subb_u32 s[sgpr105], s[sgpr105], s[sgprWrapUB+1] // S - WrapU -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_sub_u32 s98, s98, s[sgprWrapUB] // S - WrapU +s_subb_u32 s99, s99, s[sgprWrapUB+1] // S - WrapU +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 // Check if K even/odd -s_and_b32 s84, s[sgprSizesSum], 1 -s_cmp_eq_u32 s84, 0 +s_and_b32 s98, s[sgprSizesSum], 1 +s_cmp_eq_u32 s98, 0 s_cbranch_scc0 label_tailloop_non_dtl label_tailloop_dtl: @@ -3711,286 +3662,286 @@ ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] o ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 /* local read inc a */ -s_mov_b32 s[sgpr104], 0x40 // inc -v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s[sgpr104], v[vgprLocalReadAddrA+0] // lrA += 64 (bpeDS) +s_mov_b32 s97, 0x40 // inc +v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s97, v[vgprLocalReadAddrA+0] // lrA += 64 (bpeDS) /* local read inc b */ // inc (dup assign opt.) -v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s[sgpr104], v[vgprLocalReadAddrB+0] // lrB += 64 (bpeDS) +v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s97, v[vgprLocalReadAddrB+0] // lrB += 64 (bpeDS) s_waitcnt lgkmcnt(0) // 4wait for local read v_and_b32 v135, 63, v[vgprSerial] // v135 = v[vgprSerial] % 64 v_lshrrev_b32 v135, 4, v135 // 135 = 135 / 16 v_lshlrev_b32 v135, 3, v135 // v135 = v135 * 8 v_add_u32 v136, v135, 0 -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+0], v[vgprValuA_X0_I0+4+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+0], v[vgprValuA_X0_I0+12+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+0], v[vgprValuA_X0_I0+20+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+0], v[vgprValuA_X0_I0+28+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+20+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+28+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+0], v[vgprValuA_X0_I0+4+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+0], v[vgprValuA_X0_I0+12+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+0], v[vgprValuA_X0_I0+20+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+0], v[vgprValuA_X0_I0+28+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+20+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+28+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+2], v[vgprValuA_X0_I0+4+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+2], v[vgprValuA_X0_I0+12+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+2], v[vgprValuA_X0_I0+20+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+2], v[vgprValuA_X0_I0+28+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+2], v[vgprValuA_X0_I0+4+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+2], v[vgprValuA_X0_I0+12+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+2], v[vgprValuA_X0_I0+20+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+2], v[vgprValuA_X0_I0+28+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL v_and_b32 v135, 63, v[vgprSerial] // v135 = v[vgprSerial] % 64 v_lshrrev_b32 v135, 4, v135 // 135 = 135 / 16 v_lshlrev_b32 v135, 3, v135 // v135 = v135 * 8 v_add_u32 v136, v135, 0 -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+0], v[vgprValuB_X0_I0+4+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+0], v[vgprValuB_X0_I0+12+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+0], v[vgprValuB_X0_I0+20+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+0], v[vgprValuB_X0_I0+28+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+1], v[vgprValuB_X0_I0+4+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+1], v[vgprValuB_X0_I0+12+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+1], v[vgprValuB_X0_I0+20+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+1], v[vgprValuB_X0_I0+28+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+0], v[vgprValuB_X0_I0+4+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+0], v[vgprValuB_X0_I0+12+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+0], v[vgprValuB_X0_I0+20+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+0], v[vgprValuB_X0_I0+28+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+1], v[vgprValuB_X0_I0+4+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+1], v[vgprValuB_X0_I0+12+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+1], v[vgprValuB_X0_I0+20+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+1], v[vgprValuB_X0_I0+28+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+2], v[vgprValuB_X0_I0+4+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+2], v[vgprValuB_X0_I0+12+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+2], v[vgprValuB_X0_I0+20+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+2], v[vgprValuB_X0_I0+28+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+3], v[vgprValuB_X0_I0+4+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+3], v[vgprValuB_X0_I0+12+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+3], v[vgprValuB_X0_I0+20+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+3], v[vgprValuB_X0_I0+28+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -s_and_b32 s[sgpr106], s[sgprLoopCounterL], 7 // get inputs for edge thread -s_sub_u32 s[sgpr106], 8, s[sgpr106] // use shift to fill 0 for outside element -s_lshl_b32 s[sgpr106], s[sgpr106], 4 // use shift to fill 0 for outside element -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+0+0+0+0:vgprValuA_X0_I0+0+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+0+0+0+2:vgprValuA_X0_I0+0+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+2], v[vgprValuB_X0_I0+4+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+2], v[vgprValuB_X0_I0+12+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+2], v[vgprValuB_X0_I0+20+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+2], v[vgprValuB_X0_I0+28+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+3], v[vgprValuB_X0_I0+4+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+3], v[vgprValuB_X0_I0+12+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+3], v[vgprValuB_X0_I0+20+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+3], v[vgprValuB_X0_I0+28+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +s_and_b32 s97, s[sgprLoopCounterL], 7 // get inputs for edge thread +s_sub_u32 s97, 8, s97 // use shift to fill 0 for outside element +s_lshl_b32 s97, s97, 4 // use shift to fill 0 for outside element +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+0+0+0+0:vgprValuA_X0_I0+0+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+0+0+0+2:vgprValuA_X0_I0+0+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+4+0+0+0:vgprValuA_X0_I0+4+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+4+0+0+2:vgprValuA_X0_I0+4+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+4+0+0+0:vgprValuA_X0_I0+4+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+4+0+0+2:vgprValuA_X0_I0+4+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+0], v[vgprValuA_X0_I0+4+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+0], v[vgprValuA_X0_I0+4+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+2], v[vgprValuA_X0_I0+4+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+8+0+0+0:vgprValuA_X0_I0+8+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+8+0+0+2:vgprValuA_X0_I0+8+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+2], v[vgprValuA_X0_I0+4+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+8+0+0+0:vgprValuA_X0_I0+8+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+8+0+0+2:vgprValuA_X0_I0+8+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+12+0+0+0:vgprValuA_X0_I0+12+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+12+0+0+2:vgprValuA_X0_I0+12+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+12+0+0+0:vgprValuA_X0_I0+12+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+12+0+0+2:vgprValuA_X0_I0+12+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+0], v[vgprValuA_X0_I0+12+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+0], v[vgprValuA_X0_I0+12+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+2], v[vgprValuA_X0_I0+12+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+16+0+0+0:vgprValuA_X0_I0+16+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+16+0+0+2:vgprValuA_X0_I0+16+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+2], v[vgprValuA_X0_I0+12+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+16+0+0+0:vgprValuA_X0_I0+16+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+16+0+0+2:vgprValuA_X0_I0+16+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+20+0+0+0:vgprValuA_X0_I0+20+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+20+0+0+2:vgprValuA_X0_I0+20+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+20+0+0+0:vgprValuA_X0_I0+20+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+20+0+0+2:vgprValuA_X0_I0+20+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+0], v[vgprValuA_X0_I0+20+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+20+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+0], v[vgprValuA_X0_I0+20+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+20+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+2], v[vgprValuA_X0_I0+20+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+24+0+0+0:vgprValuA_X0_I0+24+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+24+0+0+2:vgprValuA_X0_I0+24+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+2], v[vgprValuA_X0_I0+20+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+24+0+0+0:vgprValuA_X0_I0+24+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+24+0+0+2:vgprValuA_X0_I0+24+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+28+0+0+0:vgprValuA_X0_I0+28+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+28+0+0+2:vgprValuA_X0_I0+28+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+28+0+0+0:vgprValuA_X0_I0+28+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+28+0+0+2:vgprValuA_X0_I0+28+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+0], v[vgprValuA_X0_I0+28+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+28+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+0], v[vgprValuA_X0_I0+28+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+28+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+2], v[vgprValuA_X0_I0+28+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+0+0+0+0:vgprValuB_X0_I0+0+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+0+0+0+2:vgprValuB_X0_I0+0+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+2], v[vgprValuA_X0_I0+28+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+0+0+0+0:vgprValuB_X0_I0+0+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+0+0+0+2:vgprValuB_X0_I0+0+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+4+0+0+0:vgprValuB_X0_I0+4+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+4+0+0+2:vgprValuB_X0_I0+4+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+4+0+0+0:vgprValuB_X0_I0+4+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+4+0+0+2:vgprValuB_X0_I0+4+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+0], v[vgprValuB_X0_I0+4+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+1], v[vgprValuB_X0_I0+4+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+0], v[vgprValuB_X0_I0+4+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+1], v[vgprValuB_X0_I0+4+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+2], v[vgprValuB_X0_I0+4+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+3], v[vgprValuB_X0_I0+4+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+8+0+0+0:vgprValuB_X0_I0+8+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+8+0+0+2:vgprValuB_X0_I0+8+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+2], v[vgprValuB_X0_I0+4+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+3], v[vgprValuB_X0_I0+4+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+8+0+0+0:vgprValuB_X0_I0+8+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+8+0+0+2:vgprValuB_X0_I0+8+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+12+0+0+0:vgprValuB_X0_I0+12+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+12+0+0+2:vgprValuB_X0_I0+12+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+12+0+0+0:vgprValuB_X0_I0+12+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+12+0+0+2:vgprValuB_X0_I0+12+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+0], v[vgprValuB_X0_I0+12+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+1], v[vgprValuB_X0_I0+12+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+0], v[vgprValuB_X0_I0+12+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+1], v[vgprValuB_X0_I0+12+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+2], v[vgprValuB_X0_I0+12+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+3], v[vgprValuB_X0_I0+12+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+16+0+0+0:vgprValuB_X0_I0+16+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+16+0+0+2:vgprValuB_X0_I0+16+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+2], v[vgprValuB_X0_I0+12+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+3], v[vgprValuB_X0_I0+12+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+16+0+0+0:vgprValuB_X0_I0+16+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+16+0+0+2:vgprValuB_X0_I0+16+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+20+0+0+0:vgprValuB_X0_I0+20+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+20+0+0+2:vgprValuB_X0_I0+20+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+20+0+0+0:vgprValuB_X0_I0+20+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+20+0+0+2:vgprValuB_X0_I0+20+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+0], v[vgprValuB_X0_I0+20+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+1], v[vgprValuB_X0_I0+20+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+0], v[vgprValuB_X0_I0+20+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+1], v[vgprValuB_X0_I0+20+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+2], v[vgprValuB_X0_I0+20+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+3], v[vgprValuB_X0_I0+20+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+24+0+0+0:vgprValuB_X0_I0+24+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+24+0+0+2:vgprValuB_X0_I0+24+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+2], v[vgprValuB_X0_I0+20+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+3], v[vgprValuB_X0_I0+20+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+24+0+0+0:vgprValuB_X0_I0+24+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+24+0+0+2:vgprValuB_X0_I0+24+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+28+0+0+0:vgprValuB_X0_I0+28+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+28+0+0+2:vgprValuB_X0_I0+28+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+28+0+0+0:vgprValuB_X0_I0+28+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+28+0+0+2:vgprValuB_X0_I0+28+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+0], v[vgprValuB_X0_I0+28+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+1], v[vgprValuB_X0_I0+28+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+0], v[vgprValuB_X0_I0+28+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+1], v[vgprValuB_X0_I0+28+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+2], v[vgprValuB_X0_I0+28+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+3], v[vgprValuB_X0_I0+28+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+2], v[vgprValuB_X0_I0+28+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+3], v[vgprValuB_X0_I0+28+0+0+3], v141, s[98:99] s_nop 1 v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] @@ -4086,286 +4037,286 @@ ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] o ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 /* local read inc a */ -s_mov_b32 s[sgpr104], 0x40 // inc -v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s[sgpr104], v[vgprLocalReadAddrA+0] // lrA += 64 (bpeDS) +s_mov_b32 s97, 0x40 // inc +v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s97, v[vgprLocalReadAddrA+0] // lrA += 64 (bpeDS) /* local read inc b */ // inc (dup assign opt.) -v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s[sgpr104], v[vgprLocalReadAddrB+0] // lrB += 64 (bpeDS) +v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s97, v[vgprLocalReadAddrB+0] // lrB += 64 (bpeDS) s_waitcnt lgkmcnt(0) // 4wait for local read v_and_b32 v135, 63, v[vgprSerial] // v135 = v[vgprSerial] % 64 v_lshrrev_b32 v135, 4, v135 // 135 = 135 / 16 v_lshlrev_b32 v135, 3, v135 // v135 = v135 * 8 v_add_u32 v136, v135, 0 -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+0], v[vgprValuA_X1_I0+0+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+0], v[vgprValuA_X1_I0+4+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+0], v[vgprValuA_X1_I0+8+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+0], v[vgprValuA_X1_I0+12+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+0], v[vgprValuA_X1_I0+16+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+0], v[vgprValuA_X1_I0+20+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+0], v[vgprValuA_X1_I0+24+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+0], v[vgprValuA_X1_I0+28+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+1], v[vgprValuA_X1_I0+0+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+1], v[vgprValuA_X1_I0+4+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+1], v[vgprValuA_X1_I0+8+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+1], v[vgprValuA_X1_I0+12+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+1], v[vgprValuA_X1_I0+16+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+1], v[vgprValuA_X1_I0+20+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+1], v[vgprValuA_X1_I0+24+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+1], v[vgprValuA_X1_I0+28+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+0], v[vgprValuA_X1_I0+0+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+0], v[vgprValuA_X1_I0+4+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+0], v[vgprValuA_X1_I0+8+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+0], v[vgprValuA_X1_I0+12+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+0], v[vgprValuA_X1_I0+16+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+0], v[vgprValuA_X1_I0+20+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+0], v[vgprValuA_X1_I0+24+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+0], v[vgprValuA_X1_I0+28+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+1], v[vgprValuA_X1_I0+0+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+1], v[vgprValuA_X1_I0+4+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+1], v[vgprValuA_X1_I0+8+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+1], v[vgprValuA_X1_I0+12+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+1], v[vgprValuA_X1_I0+16+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+1], v[vgprValuA_X1_I0+20+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+1], v[vgprValuA_X1_I0+24+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+1], v[vgprValuA_X1_I0+28+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+2], v[vgprValuA_X1_I0+0+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+2], v[vgprValuA_X1_I0+4+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+2], v[vgprValuA_X1_I0+8+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+2], v[vgprValuA_X1_I0+12+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+2], v[vgprValuA_X1_I0+16+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+2], v[vgprValuA_X1_I0+20+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+2], v[vgprValuA_X1_I0+24+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+2], v[vgprValuA_X1_I0+28+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+2], v[vgprValuA_X1_I0+0+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+2], v[vgprValuA_X1_I0+4+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+2], v[vgprValuA_X1_I0+8+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+2], v[vgprValuA_X1_I0+12+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+2], v[vgprValuA_X1_I0+16+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+2], v[vgprValuA_X1_I0+20+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+2], v[vgprValuA_X1_I0+24+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+2], v[vgprValuA_X1_I0+28+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL v_and_b32 v135, 63, v[vgprSerial] // v135 = v[vgprSerial] % 64 v_lshrrev_b32 v135, 4, v135 // 135 = 135 / 16 v_lshlrev_b32 v135, 3, v135 // v135 = v135 * 8 v_add_u32 v136, v135, 0 -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+0], v[vgprValuB_X1_I0+0+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+0], v[vgprValuB_X1_I0+4+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+0], v[vgprValuB_X1_I0+8+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+0], v[vgprValuB_X1_I0+12+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+0], v[vgprValuB_X1_I0+16+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+0], v[vgprValuB_X1_I0+20+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+0], v[vgprValuB_X1_I0+24+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+0], v[vgprValuB_X1_I0+28+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+1], v[vgprValuB_X1_I0+0+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+1], v[vgprValuB_X1_I0+4+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+1], v[vgprValuB_X1_I0+8+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+1], v[vgprValuB_X1_I0+12+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+1], v[vgprValuB_X1_I0+16+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+1], v[vgprValuB_X1_I0+20+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+1], v[vgprValuB_X1_I0+24+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+1], v[vgprValuB_X1_I0+28+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+0], v[vgprValuB_X1_I0+0+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+0], v[vgprValuB_X1_I0+4+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+0], v[vgprValuB_X1_I0+8+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+0], v[vgprValuB_X1_I0+12+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+0], v[vgprValuB_X1_I0+16+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+0], v[vgprValuB_X1_I0+20+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+0], v[vgprValuB_X1_I0+24+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+0], v[vgprValuB_X1_I0+28+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+1], v[vgprValuB_X1_I0+0+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+1], v[vgprValuB_X1_I0+4+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+1], v[vgprValuB_X1_I0+8+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+1], v[vgprValuB_X1_I0+12+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+1], v[vgprValuB_X1_I0+16+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+1], v[vgprValuB_X1_I0+20+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+1], v[vgprValuB_X1_I0+24+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+1], v[vgprValuB_X1_I0+28+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+2], v[vgprValuB_X1_I0+0+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+2], v[vgprValuB_X1_I0+4+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+2], v[vgprValuB_X1_I0+8+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+2], v[vgprValuB_X1_I0+12+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+2], v[vgprValuB_X1_I0+16+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+2], v[vgprValuB_X1_I0+20+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+2], v[vgprValuB_X1_I0+24+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+2], v[vgprValuB_X1_I0+28+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+3], v[vgprValuB_X1_I0+0+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+3], v[vgprValuB_X1_I0+4+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+3], v[vgprValuB_X1_I0+8+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+3], v[vgprValuB_X1_I0+12+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+3], v[vgprValuB_X1_I0+16+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+3], v[vgprValuB_X1_I0+20+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+3], v[vgprValuB_X1_I0+24+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+3], v[vgprValuB_X1_I0+28+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -s_and_b32 s[sgpr106], s[sgprLoopCounterL], 7 // get inputs for edge thread -s_sub_u32 s[sgpr106], 8, s[sgpr106] // use shift to fill 0 for outside element -s_lshl_b32 s[sgpr106], s[sgpr106], 4 // use shift to fill 0 for outside element -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+0+0+0+0:vgprValuA_X1_I0+0+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+0+0+0+2:vgprValuA_X1_I0+0+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+2], v[vgprValuB_X1_I0+0+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+2], v[vgprValuB_X1_I0+4+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+2], v[vgprValuB_X1_I0+8+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+2], v[vgprValuB_X1_I0+12+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+2], v[vgprValuB_X1_I0+16+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+2], v[vgprValuB_X1_I0+20+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+2], v[vgprValuB_X1_I0+24+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+2], v[vgprValuB_X1_I0+28+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+3], v[vgprValuB_X1_I0+0+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+3], v[vgprValuB_X1_I0+4+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+3], v[vgprValuB_X1_I0+8+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+3], v[vgprValuB_X1_I0+12+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+3], v[vgprValuB_X1_I0+16+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+3], v[vgprValuB_X1_I0+20+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+3], v[vgprValuB_X1_I0+24+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+3], v[vgprValuB_X1_I0+28+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +s_and_b32 s97, s[sgprLoopCounterL], 7 // get inputs for edge thread +s_sub_u32 s97, 8, s97 // use shift to fill 0 for outside element +s_lshl_b32 s97, s97, 4 // use shift to fill 0 for outside element +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+0+0+0+0:vgprValuA_X1_I0+0+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+0+0+0+2:vgprValuA_X1_I0+0+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+0], v[vgprValuA_X1_I0+0+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+1], v[vgprValuA_X1_I0+0+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+0], v[vgprValuA_X1_I0+0+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+1], v[vgprValuA_X1_I0+0+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+2], v[vgprValuA_X1_I0+0+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+4+0+0+0:vgprValuA_X1_I0+4+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+4+0+0+2:vgprValuA_X1_I0+4+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+2], v[vgprValuA_X1_I0+0+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+4+0+0+0:vgprValuA_X1_I0+4+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+4+0+0+2:vgprValuA_X1_I0+4+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+0], v[vgprValuA_X1_I0+4+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+1], v[vgprValuA_X1_I0+4+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+0], v[vgprValuA_X1_I0+4+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+1], v[vgprValuA_X1_I0+4+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+2], v[vgprValuA_X1_I0+4+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+8+0+0+0:vgprValuA_X1_I0+8+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+8+0+0+2:vgprValuA_X1_I0+8+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+2], v[vgprValuA_X1_I0+4+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+8+0+0+0:vgprValuA_X1_I0+8+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+8+0+0+2:vgprValuA_X1_I0+8+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+0], v[vgprValuA_X1_I0+8+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+1], v[vgprValuA_X1_I0+8+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+0], v[vgprValuA_X1_I0+8+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+1], v[vgprValuA_X1_I0+8+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+2], v[vgprValuA_X1_I0+8+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+12+0+0+0:vgprValuA_X1_I0+12+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+12+0+0+2:vgprValuA_X1_I0+12+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+2], v[vgprValuA_X1_I0+8+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+12+0+0+0:vgprValuA_X1_I0+12+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+12+0+0+2:vgprValuA_X1_I0+12+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+0], v[vgprValuA_X1_I0+12+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+1], v[vgprValuA_X1_I0+12+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+0], v[vgprValuA_X1_I0+12+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+1], v[vgprValuA_X1_I0+12+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+2], v[vgprValuA_X1_I0+12+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+16+0+0+0:vgprValuA_X1_I0+16+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+16+0+0+2:vgprValuA_X1_I0+16+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+2], v[vgprValuA_X1_I0+12+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+16+0+0+0:vgprValuA_X1_I0+16+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+16+0+0+2:vgprValuA_X1_I0+16+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+0], v[vgprValuA_X1_I0+16+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+1], v[vgprValuA_X1_I0+16+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+0], v[vgprValuA_X1_I0+16+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+1], v[vgprValuA_X1_I0+16+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+2], v[vgprValuA_X1_I0+16+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+20+0+0+0:vgprValuA_X1_I0+20+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+20+0+0+2:vgprValuA_X1_I0+20+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+2], v[vgprValuA_X1_I0+16+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+20+0+0+0:vgprValuA_X1_I0+20+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+20+0+0+2:vgprValuA_X1_I0+20+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+0], v[vgprValuA_X1_I0+20+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+1], v[vgprValuA_X1_I0+20+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+0], v[vgprValuA_X1_I0+20+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+1], v[vgprValuA_X1_I0+20+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+2], v[vgprValuA_X1_I0+20+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+24+0+0+0:vgprValuA_X1_I0+24+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+24+0+0+2:vgprValuA_X1_I0+24+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+2], v[vgprValuA_X1_I0+20+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+24+0+0+0:vgprValuA_X1_I0+24+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+24+0+0+2:vgprValuA_X1_I0+24+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+0], v[vgprValuA_X1_I0+24+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+1], v[vgprValuA_X1_I0+24+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+0], v[vgprValuA_X1_I0+24+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+1], v[vgprValuA_X1_I0+24+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+2], v[vgprValuA_X1_I0+24+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+28+0+0+0:vgprValuA_X1_I0+28+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+28+0+0+2:vgprValuA_X1_I0+28+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+2], v[vgprValuA_X1_I0+24+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+28+0+0+0:vgprValuA_X1_I0+28+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+28+0+0+2:vgprValuA_X1_I0+28+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+0], v[vgprValuA_X1_I0+28+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+1], v[vgprValuA_X1_I0+28+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+0], v[vgprValuA_X1_I0+28+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+1], v[vgprValuA_X1_I0+28+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+2], v[vgprValuA_X1_I0+28+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+0+0+0+0:vgprValuB_X1_I0+0+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+0+0+0+2:vgprValuB_X1_I0+0+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+2], v[vgprValuA_X1_I0+28+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+0+0+0+0:vgprValuB_X1_I0+0+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+0+0+0+2:vgprValuB_X1_I0+0+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+0], v[vgprValuB_X1_I0+0+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+1], v[vgprValuB_X1_I0+0+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+0], v[vgprValuB_X1_I0+0+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+1], v[vgprValuB_X1_I0+0+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+2], v[vgprValuB_X1_I0+0+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+3], v[vgprValuB_X1_I0+0+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+4+0+0+0:vgprValuB_X1_I0+4+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+4+0+0+2:vgprValuB_X1_I0+4+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+2], v[vgprValuB_X1_I0+0+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+3], v[vgprValuB_X1_I0+0+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+4+0+0+0:vgprValuB_X1_I0+4+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+4+0+0+2:vgprValuB_X1_I0+4+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+0], v[vgprValuB_X1_I0+4+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+1], v[vgprValuB_X1_I0+4+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+0], v[vgprValuB_X1_I0+4+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+1], v[vgprValuB_X1_I0+4+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+2], v[vgprValuB_X1_I0+4+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+3], v[vgprValuB_X1_I0+4+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+8+0+0+0:vgprValuB_X1_I0+8+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+8+0+0+2:vgprValuB_X1_I0+8+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+2], v[vgprValuB_X1_I0+4+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+3], v[vgprValuB_X1_I0+4+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+8+0+0+0:vgprValuB_X1_I0+8+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+8+0+0+2:vgprValuB_X1_I0+8+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+0], v[vgprValuB_X1_I0+8+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+1], v[vgprValuB_X1_I0+8+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+0], v[vgprValuB_X1_I0+8+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+1], v[vgprValuB_X1_I0+8+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+2], v[vgprValuB_X1_I0+8+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+3], v[vgprValuB_X1_I0+8+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+12+0+0+0:vgprValuB_X1_I0+12+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+12+0+0+2:vgprValuB_X1_I0+12+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+2], v[vgprValuB_X1_I0+8+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+3], v[vgprValuB_X1_I0+8+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+12+0+0+0:vgprValuB_X1_I0+12+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+12+0+0+2:vgprValuB_X1_I0+12+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+0], v[vgprValuB_X1_I0+12+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+1], v[vgprValuB_X1_I0+12+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+0], v[vgprValuB_X1_I0+12+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+1], v[vgprValuB_X1_I0+12+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+2], v[vgprValuB_X1_I0+12+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+3], v[vgprValuB_X1_I0+12+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+16+0+0+0:vgprValuB_X1_I0+16+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+16+0+0+2:vgprValuB_X1_I0+16+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+2], v[vgprValuB_X1_I0+12+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+3], v[vgprValuB_X1_I0+12+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+16+0+0+0:vgprValuB_X1_I0+16+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+16+0+0+2:vgprValuB_X1_I0+16+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+0], v[vgprValuB_X1_I0+16+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+1], v[vgprValuB_X1_I0+16+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+0], v[vgprValuB_X1_I0+16+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+1], v[vgprValuB_X1_I0+16+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+2], v[vgprValuB_X1_I0+16+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+3], v[vgprValuB_X1_I0+16+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+20+0+0+0:vgprValuB_X1_I0+20+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+20+0+0+2:vgprValuB_X1_I0+20+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+2], v[vgprValuB_X1_I0+16+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+3], v[vgprValuB_X1_I0+16+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+20+0+0+0:vgprValuB_X1_I0+20+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+20+0+0+2:vgprValuB_X1_I0+20+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+0], v[vgprValuB_X1_I0+20+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+1], v[vgprValuB_X1_I0+20+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+0], v[vgprValuB_X1_I0+20+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+1], v[vgprValuB_X1_I0+20+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+2], v[vgprValuB_X1_I0+20+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+3], v[vgprValuB_X1_I0+20+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+24+0+0+0:vgprValuB_X1_I0+24+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+24+0+0+2:vgprValuB_X1_I0+24+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+2], v[vgprValuB_X1_I0+20+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+3], v[vgprValuB_X1_I0+20+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+24+0+0+0:vgprValuB_X1_I0+24+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+24+0+0+2:vgprValuB_X1_I0+24+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+0], v[vgprValuB_X1_I0+24+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+1], v[vgprValuB_X1_I0+24+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+0], v[vgprValuB_X1_I0+24+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+1], v[vgprValuB_X1_I0+24+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+2], v[vgprValuB_X1_I0+24+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+3], v[vgprValuB_X1_I0+24+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+28+0+0+0:vgprValuB_X1_I0+28+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+28+0+0+2:vgprValuB_X1_I0+28+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+2], v[vgprValuB_X1_I0+24+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+3], v[vgprValuB_X1_I0+24+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+28+0+0+0:vgprValuB_X1_I0+28+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+28+0+0+2:vgprValuB_X1_I0+28+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+0], v[vgprValuB_X1_I0+28+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+1], v[vgprValuB_X1_I0+28+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+0], v[vgprValuB_X1_I0+28+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+1], v[vgprValuB_X1_I0+28+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+2], v[vgprValuB_X1_I0+28+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+3], v[vgprValuB_X1_I0+28+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+2], v[vgprValuB_X1_I0+28+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+3], v[vgprValuB_X1_I0+28+0+0+3], v141, s[98:99] s_nop 1 v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] @@ -4438,12 +4389,12 @@ s_add_u32 s[sgprOrigLoopCounter], s[sgprOrigLoopCounter], 0x20 // inc counterL s_cmp_le_i32 s[sgprLoopCounterL], 0x0 // counterL<=0 s_cbranch_scc0 label_TailLoopBeginL // restart LoopL label_TailLoopEndL: -s_mov_b32 s[sgpr104], 2 // tailloop lds offset -s_mul_i32 s[sgpr104], s[sgprOrigLoopCounter], s[sgpr104] // scale by mul -v_sub_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgpr104] // remove lro damage -s_mov_b32 s[sgpr104], 2 // tailloop lds offset -s_mul_i32 s[sgpr104], s[sgprOrigLoopCounter], s[sgpr104] // scale by mul -v_sub_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgpr104] // remove lro damage +s_mov_b32 s97, 2 // tailloop lds offset +s_mul_i32 s97, s[sgprOrigLoopCounter], s97 // scale by mul +v_sub_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s97 // remove lro damage +s_mov_b32 s97, 2 // tailloop lds offset +s_mul_i32 s97, s[sgprOrigLoopCounter], s97 // scale by mul +v_sub_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s97 // remove lro damage label_SkipTailLoopL: .set vgprValuA_X0_I0_BASE, UNDEF .set vgprValuA_X0_I0, UNDEF @@ -4454,11 +4405,11 @@ label_SkipTailLoopL: label_Summation_End_DZOUDPYJU2HHRCOQ: .set sgprLoopCounterL, UNDEF .set sgprOrigLoopCounter, UNDEF -.set sgprStaggerUIter, UNDEF .set sgprSrdA, UNDEF .set sgprSrdB, UNDEF .set sgprShadowLimitA, UNDEF .set sgprShadowLimitB, UNDEF +.set sgprStaggerUIter, UNDEF .set sgprWrapUA, UNDEF .set sgprWrapUB, UNDEF .set sgprGlobalReadIncsA, UNDEF @@ -4466,54 +4417,27 @@ label_Summation_End_DZOUDPYJU2HHRCOQ: .set sgprScalarGlobalReadOffsetA, UNDEF .set sgprScalarGlobalReadOffsetB, UNDEF /* load store sgprs */ -.set sgprAddressScaleAlphaVec, 72 -.set sgprAddressBias, 74 -.set sgprBiasType, 76 -.set sgprBiasStride, 77 -.set sgpractivationAlpha, 78 -.set sgpractivationBeta, 79 -.set sgprActivationType, 80 - -v_readlane_b32 s[sgprSKItersPerWG], v255, 0 -s_nop 0 -v_readlane_b32 s[sgprskGrid], v255, 1 -s_nop 0 -v_readlane_b32 s[sgprMagicNumberProblemNumGroupTiles0], v255, 2 -s_nop 0 -v_readlane_b32 s[sgprMagicShiftProblemNumGroupTiles0], v255, 3 -s_nop 0 -v_readlane_b32 s[sgprMagicShiftItersPerTile], v255, 4 -s_nop 0 -v_readlane_b32 s[sgprMagicNumProblemNumGroupTiles0By1], v255, 5 -s_nop 0 -v_readlane_b32 s[sgprWGM], v255, 6 -s_nop 0 -v_readlane_b32 s[sgprKernArgAddress], v255, 7 -s_nop 0 -v_readlane_b32 s[sgprKernArgAddress+1], v255, 8 - -.set sgpr104, UNDEF -.set sgpr105, UNDEF -.set sgpr106, UNDEF -.set sgpr107, UNDEF -.set sgpr108, UNDEF -.set sgpr109, UNDEF -.set sgpr110, UNDEF - +.set sgprAddressScaleAlphaVec, 64 +.set sgprAddressBias, 66 +.set sgprBiasType, 68 +.set sgprBiasStride, 69 +.set sgpractivationAlpha, 70 +.set sgpractivationBeta, 71 +.set sgprActivationType, 72 /* Check if custom structure pointer is null */ s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ? s_cbranch_scc1 label_LoadExternalEpilogueStruct // branch if ArgType == 2 -s_load_dwordx8 s[72:79], s[sgprKernArgAddress:sgprKernArgAddress+1], 152 // 152 -s_load_dword s80, s[sgprKernArgAddress:sgprKernArgAddress+1], 184 // 184 +s_load_dwordx8 s[64:71], s[sgprKernArgAddress:sgprKernArgAddress+1], 124 // 124 +s_load_dword s72, s[sgprKernArgAddress:sgprKernArgAddress+1], 156 // 156 s_branch label_LoadExternalEpilogueStructEnd label_LoadExternalEpilogueStruct: -s_load_dwordx4 s[72:75], s[sgprKernArgAddress:sgprKernArgAddress+1], 208 // 208 -s_load_dwordx2 s[76:77], s[sgprKernArgAddress:sgprKernArgAddress+1], 224 // 224 -s_load_dwordx2 s[78:79], s[sgprKernArgAddress:sgprKernArgAddress+1], 248 // 248 -s_load_dword s80, s[sgprKernArgAddress:sgprKernArgAddress+1], 256 // 256 +s_load_dwordx4 s[64:67], s[sgprKernArgAddress:sgprKernArgAddress+1], 180 // 180 +s_load_dwordx2 s[68:69], s[sgprKernArgAddress:sgprKernArgAddress+1], 196 // 196 +s_load_dwordx2 s[70:71], s[sgprKernArgAddress:sgprKernArgAddress+1], 220 // 220 +s_load_dword s72, s[sgprKernArgAddress:sgprKernArgAddress+1], 228 // 228 label_LoadExternalEpilogueStructEnd: -.set sgprSrdScaleAlphaVec, 84 -.set sgprSrdBias, 88 +.set sgprSrdScaleAlphaVec, 76 +.set sgprSrdBias, 80 /* Mapping of Acc register -> C Vgpr register */ @@ -4628,34 +4552,44 @@ label_Load_Bias_End: .set sgprSrdScaleAlphaVec, UNDEF s_cmp_eq_u32 s[sgprStreamKLocalStart], 0 // does wg start tile? s_cbranch_scc1 label_NoBranch_QWMA7J3AUDGL0X23 // Only branch on scc0 -s_getpc_b64 s[92:93] // addr of next instr -s_add_i32 s94, label_SK_Partials, 4 // target branch offset -s_add_u32 s92, s92, s94 // add target branch offset -s_addc_u32 s93, s93, 0 // add high and carry -s_setpc_b64 s[92:93] // branch to label_SK_Partials +s_getpc_b64 s[84:85] // addr of next instr +s_add_i32 s86, label_SK_Partials, 4 // target branch offset +s_add_u32 s84, s84, s86 // add target branch offset +s_addc_u32 s85, s85, 0 // add high and carry +s_setpc_b64 s[84:85] // branch to label_SK_Partials label_NoBranch_QWMA7J3AUDGL0X23: s_cmp_eq_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // does wg finish tile? s_cbranch_scc1 label_SK_Store // Branch if started and finished tile, go to regular store code -s_add_u32 s67, s[sgprStreamKIdx], 1 // input partial tile index -s_mul_hi_u32 s82, s[sgprStreamKIterEnd], s[sgprMagicNumberItersPerTile] // s_magic mul, div alg 2 -s_lshr_b32 s83, s[sgprMagicShiftItersPerTile], 31 // tmpS = extract abit -s_mul_i32 s81, s[sgprStreamKIterEnd], s83 // s_magic mul, div alg 2 -s_add_u32 s81, s81, s82 -s_and_b32 s83, s[sgprMagicShiftItersPerTile], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s81, s81, s83 // sMagicDiv Alg 2 -s_mul_i32 s81, s81, s[sgprItersPerTile] // start iteration of partial tile -s_sub_u32 s85, s[sgprStreamKIterEnd], s81 // calc iterations completed by this WG +s_add_u32 s77, s[sgprStreamKIdx], 1 // input partial tile index +v_cvt_f32_u32 v21, s[sgprItersPerTile] // StreamKIterEnd // ItersPerTile +v_rcp_iflag_f32 v21, v21 // StreamKIterEnd // ItersPerTile +v_cvt_f32_u32 v22, s[sgprStreamKIterEnd] // StreamKIterEnd // ItersPerTile +v_mul_f32 v21, v21, v22 // StreamKIterEnd // ItersPerTile +v_cvt_u32_f32 v21, v21 // StreamKIterEnd // ItersPerTile +v_mul_u32_u24 v22, v21, s[sgprItersPerTile] // StreamKIterEnd // ItersPerTile +v_sub_u32 v22, s[sgprStreamKIterEnd], v22 // StreamKIterEnd // ItersPerTile +v_cmpx_eq_u32 exec, v22, s[sgprItersPerTile] // StreamKIterEnd // ItersPerTile +v_add_u32 v21, 1, v21 // StreamKIterEnd // ItersPerTile +v_mov_b32 v22, 0 // StreamKIterEnd // ItersPerTile +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v22, s[sgprItersPerTile] // overflow happened in remainder +v_sub_u32 v21, v21, 1 // quotient - 1 +v_mul_u32_u24 v22, v21, s[sgprItersPerTile] // re-calculate remainder +v_sub_u32 v22, s[sgprStreamKIterEnd], v22 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s73, v21 // quotient +v_readfirstlane_b32 s78, v22 // remainder label_SK_Fixup: -s_lshl_b32 s81, s67, 2 // flag offset based on CTA index -s_load_dword s83, s[sgprAddressFlags:sgprAddressFlags+1], s81 glc // get flag +s_lshl_b32 s73, s77, 2 // flag offset based on CTA index +s_load_dword s75, s[sgprAddressFlags:sgprAddressFlags+1], s73 glc // get flag s_waitcnt lgkmcnt(0) // wait for flag load -s_cmp_eq_u32 s83, 1 // check if ready +s_cmp_eq_u32 s75, 1 // check if ready s_cbranch_scc0 label_SK_Fixup // if flag not set, wait and check again s_barrier // wait for all workgroups before resetting flag -v_readfirstlane_b32 s83, v[vgprSerial] // Wave 0 updates flags -s_cmp_eq_u32 s83, 0 // Check for wave 0 +v_readfirstlane_b32 s75, v[vgprSerial] // Wave 0 updates flags +s_cmp_eq_u32 s75, 0 // Check for wave 0 s_cbranch_scc0 label_SK_SkipFlagReset // Skip flag reset -s_store_dword s83, s[sgprAddressFlags:sgprAddressFlags+1], s81 glc // reset flag +s_store_dword s75, s[sgprAddressFlags:sgprAddressFlags+1], s73 glc // reset flag label_SK_SkipFlagReset: label_Fixup_E0: @@ -4664,8 +4598,8 @@ s_mov_b64 s[sgprSrdWS+0:sgprSrdWS+0+1], s[sgprAddressWS+0:sgprAddressWS+0+1] // s_mov_b32 s[sgprSrdWS+2], BufferOOB s_mov_b32 s[sgprSrdWS+3], Srd127_96 // Set bits 127_96 in post-loop SRD -s_mul_i32 s82, 0x40000, s67 // Offset to correct partials tile -s_add_u32 s[sgprSrdWS+0], s[sgprSrdWS+0], s82 // add lo to SRD +s_mul_i32 s74, 0x40000, s77 // Offset to correct partials tile +s_add_u32 s[sgprSrdWS+0], s[sgprSrdWS+0], s74 // add lo to SRD s_addc_u32 s[sgprSrdWS+1], s[sgprSrdWS+1], 0 // add hi to SRD /* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ @@ -4676,42 +4610,42 @@ s_addc_u32 s[sgprSrdWS+1], s[sgprSrdWS+1], 0 // add hi to SRD /* calc coords, apply mask, and issue loads (if necessary) */ v_lshlrev_b32 v22, 5, v[vgprSerial] // v22 = v[vgprSerial] * 32 -s_mov_b32 s82, 0 // Init sgpr offset -buffer_load_dwordx4 v[120:123], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[124:127], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[136:139], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[140:143], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[144:147], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[148:151], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[152:155], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[156:159], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[160:163], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[164:167], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[168:171], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[172:175], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[176:179], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[180:183], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[184:187], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[188:191], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[192:195], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[196:199], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[200:203], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[204:207], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[208:211], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[212:215], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[216:219], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[220:223], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS +s_mov_b32 s74, 0 // Init sgpr offset +buffer_load_dwordx4 v[120:123], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[124:127], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[136:139], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[140:143], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[144:147], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[148:151], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[152:155], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[156:159], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[160:163], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[164:167], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[168:171], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[172:175], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[176:179], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[180:183], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[184:187], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[188:191], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[192:195], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[196:199], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[200:203], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[204:207], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[208:211], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[212:215], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[216:219], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[220:223], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] @@ -5040,42 +4974,42 @@ s_nop 0 // 1 wait state required when /******************************************/ /* calc coords, apply mask, and issue loads (if necessary) */ -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[120:123], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[124:127], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[136:139], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[140:143], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[144:147], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[148:151], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[152:155], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[156:159], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[160:163], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[164:167], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[168:171], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[172:175], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[176:179], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[180:183], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[184:187], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[188:191], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[192:195], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[196:199], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[200:203], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[204:207], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[208:211], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[212:215], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[216:219], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[220:223], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[120:123], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[124:127], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[136:139], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[140:143], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[144:147], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[148:151], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[152:155], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[156:159], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[160:163], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[164:167], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[168:171], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[172:175], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[176:179], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[180:183], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[184:187], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[188:191], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[192:195], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[196:199], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[200:203], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[204:207], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[208:211], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[212:215], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[216:219], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[220:223], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS v_accvgpr_read_b32 v[vgprValuC+24], acc129 // copy acc to vreg[96] v_accvgpr_read_b32 v[vgprValuC+25], acc133 // copy acc to vreg[97] v_accvgpr_read_b32 v[vgprValuC+26], acc137 // copy acc to vreg[98] @@ -5404,30 +5338,30 @@ s_nop 0 // 1 wait state required when /******************************************/ /* calc coords, apply mask, and issue loads (if necessary) */ -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[88:91], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[92:95], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[96:99], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[100:103], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[104:107], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[108:111], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[112:115], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[116:119], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[120:123], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[124:127], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[136:139], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[140:143], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[144:147], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[148:151], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[152:155], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[156:159], v22, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[88:91], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[92:95], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[96:99], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[100:103], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[104:107], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[108:111], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[112:115], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[116:119], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[120:123], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[124:127], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[136:139], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[140:143], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[144:147], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[148:151], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[152:155], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[156:159], v22, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS v_accvgpr_read_b32 v[vgprValuC+24], acc3 // copy acc to vreg[192] v_accvgpr_read_b32 v[vgprValuC+25], acc7 // copy acc to vreg[193] v_accvgpr_read_b32 v[vgprValuC+26], acc11 // copy acc to vreg[194] @@ -5644,42 +5578,42 @@ v_accvgpr_write_b32 acc251, v[vgprValuC+86] // copy vreg[254] to acc v_accvgpr_write_b32 acc255, v[vgprValuC+87] // copy vreg[255] to acc s_nop 1 // 2 wait states required before reading vgpr s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -s_add_u32 s81, s[sgprSKItersPerWG], 1 // Add extra iter -s_cmp_lt_u32 s67, s[sgprskExtraIters] // Check if next WG had an extra iteration -s_cselect_b32 s81, s81, s[sgprSKItersPerWG] // Select correct number of iterations for next WG -s_add_u32 s85, s85, s81 // next partial tile iteration -s_add_u32 s67, s67, 1 // next partial tile index -s_cmp_lt_u32 s85, s[sgprItersPerTile] // done loading partial tiles? +s_add_u32 s73, s[sgprSKItersPerWG], 1 // Add extra iter +s_cmp_lt_u32 s77, s[sgprskExtraIters] // Check if next WG had an extra iteration +s_cselect_b32 s73, s73, s[sgprSKItersPerWG] // Select correct number of iterations for next WG +s_add_u32 s78, s78, s73 // next partial tile iteration +s_add_u32 s77, s77, 1 // next partial tile index +s_cmp_lt_u32 s78, s[sgprItersPerTile] // done loading partial tiles? s_cbranch_scc1 label_SK_Fixup // Branch to continue fixup loop label_SK_Store: s_cmpk_eq_u32 s[sgprBeta], 0 // Beta == 0 s_cbranch_scc0 label_GW_Beta // Branch if Beta is not zero -s_and_b32 s82, 255, s[sgprSizeI] // s82 = s[sgprSizeI] % 256 -s_add_u32 s83, -0x1, s[sgprNumWorkGroups0] -s_cmp_ge_u32 s[sgprWorkGroup0], s83 // wg0 >= nwg0-1 ? -s_cselect_b32 s82, s82, 0 // set rMT0 -s_cmpk_gt_u32 s82, 0 // rMT0 > 0 +s_and_b32 s74, 255, s[sgprSizeI] // s74 = s[sgprSizeI] % 256 +s_add_u32 s75, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s75 // wg0 >= nwg0-1 ? +s_cselect_b32 s74, s74, 0 // set rMT0 +s_cmpk_gt_u32 s74, 0 // rMT0 > 0 s_cbranch_scc0 label_NoBranch_0MXDW6EW9K7ZNG8F // Only branch on scc1 // jump if edges required -s_getpc_b64 s[82:83] // addr of next instr -s_add_i32 s84, label_GW_B0_E1_M, 4 // target branch offset -s_add_u32 s82, s82, s84 // add target branch offset -s_addc_u32 s83, s83, 0 // add high and carry -s_setpc_b64 s[82:83] // branch to label_GW_B0_E1_M +s_getpc_b64 s[74:75] // addr of next instr +s_add_i32 s76, label_GW_B0_E1_M, 4 // target branch offset +s_add_u32 s74, s74, s76 // add target branch offset +s_addc_u32 s75, s75, 0 // add high and carry +s_setpc_b64 s[74:75] // branch to label_GW_B0_E1_M label_NoBranch_0MXDW6EW9K7ZNG8F: -s_and_b32 s82, 255, s[sgprSizeJ] // s82 = s[sgprSizeJ] % 256 -s_add_u32 s83, -0x1, s[sgprNumWorkGroups1] -s_cmp_ge_u32 s[sgprWorkGroup1], s83 // wg1 >= nwg1-1 -s_cselect_b32 s82, s82, 0 // set rMT1 -s_cmpk_gt_u32 s82, 0 // rMT1 > 0 +s_and_b32 s74, 255, s[sgprSizeJ] // s74 = s[sgprSizeJ] % 256 +s_add_u32 s75, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s75 // wg1 >= nwg1-1 +s_cselect_b32 s74, s74, 0 // set rMT1 +s_cmpk_gt_u32 s74, 0 // rMT1 > 0 s_cbranch_scc0 label_NoBranch_IXPKU979JKZCQDH3 // Only branch on scc1 // jump if edges required -s_getpc_b64 s[82:83] // addr of next instr -s_add_i32 s84, label_GW_B0_E1_N, 4 // target branch offset -s_add_u32 s82, s82, s84 // add target branch offset -s_addc_u32 s83, s83, 0 // add high and carry -s_setpc_b64 s[82:83] // branch to label_GW_B0_E1_N +s_getpc_b64 s[74:75] // addr of next instr +s_add_i32 s76, label_GW_B0_E1_N, 4 // target branch offset +s_add_u32 s74, s74, s76 // add target branch offset +s_addc_u32 s75, s75, 0 // add high and carry +s_setpc_b64 s[74:75] // branch to label_GW_B0_E1_N label_NoBranch_IXPKU979JKZCQDH3: label_GW_B0_E0: s_cmpk_eq_u32 s[sgprActivationType], 3 // activationType == 3 @@ -5688,28 +5622,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_0_edge_0 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_0_edge_0 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_0_edge_0 // Branch if true label_To_Activation_None_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_To_Activation_Gelu_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_To_Activation_Relu_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_To_Activation_Silu_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_5 +label_To_Activation_Clamp_VW8_beta_0_edge_0: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s73, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_ActivationSetPCAddrEnd_5: @@ -5724,8 +5666,8 @@ label_ActivationSetPCAddrEnd_5: /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v23, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v23, v4, s74 v_lshlrev_b32 v23, 0x2, v23 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -5854,7 +5796,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -5872,7 +5814,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -5881,8 +5823,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5893,7 +5835,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -5902,8 +5844,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5914,7 +5856,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -5923,8 +5865,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5935,7 +5877,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -5944,8 +5886,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5956,7 +5898,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -5965,8 +5907,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5977,7 +5919,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -5986,8 +5928,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5998,7 +5940,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -6007,8 +5949,8 @@ v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6145,7 +6087,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -6154,8 +6096,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6166,7 +6108,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -6175,8 +6117,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6187,7 +6129,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -6196,8 +6138,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6208,7 +6150,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -6217,8 +6159,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6229,7 +6171,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -6238,8 +6180,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6250,7 +6192,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -6259,8 +6201,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6271,7 +6213,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -6280,8 +6222,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6292,7 +6234,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -6301,8 +6243,8 @@ v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6439,7 +6381,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -6448,8 +6390,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6460,7 +6402,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -6469,8 +6411,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6481,7 +6423,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -6490,8 +6432,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6502,7 +6444,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -6511,8 +6453,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6523,7 +6465,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -6532,8 +6474,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6544,7 +6486,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -6553,8 +6495,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6565,7 +6507,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -6574,8 +6516,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6586,7 +6528,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -6595,8 +6537,8 @@ v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6733,7 +6675,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -6742,8 +6684,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6754,7 +6696,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -6763,8 +6705,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6775,7 +6717,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -6784,8 +6726,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6796,7 +6738,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -6805,8 +6747,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6817,7 +6759,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -6826,8 +6768,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6838,7 +6780,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -6847,8 +6789,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6859,7 +6801,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -6868,8 +6810,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6880,7 +6822,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -6889,8 +6831,8 @@ v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6902,28 +6844,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_0_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_0_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_0_edge_1 // Branch if true label_To_Activation_None_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_To_Activation_Gelu_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_To_Activation_Relu_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_To_Activation_Silu_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_4 +label_To_Activation_Clamp_VW8_beta_0_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s73, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_ActivationSetPCAddrEnd_4: @@ -6939,11 +6889,11 @@ label_ActivationSetPCAddrEnd_4: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -6952,105 +6902,105 @@ ds_read_b128 v[92:95], v22 offset:16 // load Bias ds_read_b128 v[96:99], v22 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v22 offset:1040 // load scaleAlpha v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v4, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v4, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v4, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] @@ -7163,7 +7113,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -7181,7 +7131,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -7199,7 +7149,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -7217,7 +7167,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -7235,7 +7185,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -7253,7 +7203,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -7271,7 +7221,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -7289,7 +7239,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -7315,116 +7265,116 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE ds_read_b128 v[88:91], v22 offset:0 // load Bias ds_read_b128 v[92:95], v22 offset:16 // load Bias ds_read_b128 v[96:99], v22 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v22 offset:1040 // load scaleAlpha v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v4, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v4, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v4, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc1 // copy acc to vreg[64] v_accvgpr_read_b32 v[vgprValuC+25], acc5 // copy acc to vreg[65] v_accvgpr_read_b32 v[vgprValuC+26], acc9 // copy acc to vreg[66] @@ -7537,7 +7487,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -7555,7 +7505,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -7573,7 +7523,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -7591,7 +7541,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -7609,7 +7559,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -7627,7 +7577,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -7645,7 +7595,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -7663,7 +7613,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -7689,116 +7639,116 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE ds_read_b128 v[88:91], v22 offset:0 // load Bias ds_read_b128 v[92:95], v22 offset:16 // load Bias ds_read_b128 v[96:99], v22 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v22 offset:1040 // load scaleAlpha v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v4, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v4, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v4, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc2 // copy acc to vreg[128] v_accvgpr_read_b32 v[vgprValuC+25], acc6 // copy acc to vreg[129] v_accvgpr_read_b32 v[vgprValuC+26], acc10 // copy acc to vreg[130] @@ -7911,7 +7861,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -7929,7 +7879,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -7947,7 +7897,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -7965,7 +7915,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -7983,7 +7933,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -8001,7 +7951,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -8019,7 +7969,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -8037,7 +7987,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -8063,116 +8013,116 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE ds_read_b128 v[88:91], v22 offset:0 // load Bias ds_read_b128 v[92:95], v22 offset:16 // load Bias ds_read_b128 v[96:99], v22 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v22 offset:1040 // load scaleAlpha v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v4, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v4, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v4, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc3 // copy acc to vreg[192] v_accvgpr_read_b32 v[vgprValuC+25], acc7 // copy acc to vreg[193] v_accvgpr_read_b32 v[vgprValuC+26], acc11 // copy acc to vreg[194] @@ -8285,7 +8235,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -8303,7 +8253,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -8321,7 +8271,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -8339,7 +8289,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -8357,7 +8307,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -8375,7 +8325,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -8393,7 +8343,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -8411,7 +8361,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -8430,28 +8380,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW1_beta_0_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW1_beta_0_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW1_beta_0_edge_1 // Branch if true label_To_Activation_None_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_None_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_None_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_To_Activation_Gelu_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Gelu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Gelu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_To_Activation_Relu_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Relu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Relu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_To_Activation_Silu_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Silu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Silu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_3 +label_To_Activation_Clamp_VW1_beta_0_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s73, label_Activation_Clamp_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_ActivationSetPCAddrEnd_3: @@ -8467,482 +8425,482 @@ label_ActivationSetPCAddrEnd_3: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v68, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v68, v4, s74 v_lshlrev_b32 v68, 0x2, v68 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier ds_read_b32 v65, v68 offset:0 // load Bias ds_read_b32 v66, v68 offset:1024 // load scaleAlpha v_add_lshl_u32 v67, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v67, v16, v67, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v67, v16, v67, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v72, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v72, v8, s74 v_lshlrev_b32 v72, 0x2, v72 // Bias address scaled by BPE ds_read_b32 v69, v72 offset:0 // load Bias ds_read_b32 v70, v72 offset:1024 // load scaleAlpha v_add_lshl_u32 v71, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v71, v16, v71, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v71, v16, v71, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v80, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v80, v8, s74 v_lshlrev_b32 v80, 0x2, v80 // Bias address scaled by BPE ds_read_b32 v77, v80 offset:0 // load Bias ds_read_b32 v78, v80 offset:1024 // load scaleAlpha v_add_lshl_u32 v79, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v79, v16, v79, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v79, v16, v79, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v88, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v88, v8, s74 v_lshlrev_b32 v88, 0x2, v88 // Bias address scaled by BPE ds_read_b32 v85, v88 offset:0 // load Bias ds_read_b32 v86, v88 offset:1024 // load scaleAlpha v_add_lshl_u32 v87, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v87, v16, v87, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v87, v16, v87, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v8, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE ds_read_b32 v89, v92 offset:0 // load Bias ds_read_b32 v90, v92 offset:1024 // load scaleAlpha v_add_lshl_u32 v91, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v91, v16, v91, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v91, v16, v91, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE ds_read_b32 v93, v96 offset:0 // load Bias ds_read_b32 v94, v96 offset:1024 // load scaleAlpha v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v4, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v8, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v103, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v103, v16, v103, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v103, v16, v103, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v8, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v4, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v8, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v122, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v122, v8, s74 v_lshlrev_b32 v122, 0x2, v122 // Bias address scaled by BPE v_add_lshl_u32 v121, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v121, v16, v121, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v121, v16, v121, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v128, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v128, v8, s74 v_lshlrev_b32 v128, 0x2, v128 // Bias address scaled by BPE v_add_lshl_u32 v127, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v127, v16, v127, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v127, v16, v127, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v137, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v137, v8, s74 v_lshlrev_b32 v137, 0x2, v137 // Bias address scaled by BPE v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v136, v16, v136, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v136, v16, v136, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v143, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v143, v8, s74 v_lshlrev_b32 v143, 0x2, v143 // Bias address scaled by BPE v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v142, v16, v142, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v142, v16, v142, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v149, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v149, v4, s74 v_lshlrev_b32 v149, 0x2, v149 // Bias address scaled by BPE v_add_lshl_u32 v148, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v148, v16, v148, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v148, v16, v148, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v155, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v155, v8, s74 v_lshlrev_b32 v155, 0x2, v155 // Bias address scaled by BPE v_add_lshl_u32 v154, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v154, v16, v154, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v154, v16, v154, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v161, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v161, v8, s74 v_lshlrev_b32 v161, 0x2, v161 // Bias address scaled by BPE v_add_lshl_u32 v160, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v160, v16, v160, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v160, v16, v160, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v4, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v167, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v167, v8, s74 v_lshlrev_b32 v167, 0x2, v167 // Bias address scaled by BPE v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v166, v16, v166, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v166, v16, v166, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v8, s74 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v171, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v171, v8, s74 v_lshlrev_b32 v171, 0x2, v171 // Bias address scaled by BPE v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+22], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+23], acc8 // copy acc to vreg[2] @@ -9020,265 +8978,265 @@ v_mov_b32 v19, 0x7fff0000 // fp32 Nan v_mov_b32 v20, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+21], v66, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v70, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v71, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v74, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v78, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v79, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v82, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v86, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v87, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v90, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v94, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v66, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v70, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v74, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v78, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v82, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v86, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v90, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v94, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v66, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v70, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v115, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v74, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v78, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v82, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v121, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v86, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v90, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v94, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v66, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v70, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v74, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v78, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v82, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v86, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v90, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v94, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v66, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v70, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v74, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v78, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v82, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v57, v8 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v86, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v58, v8 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v90, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v59, v8 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v94, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v60, v8 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v66, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v61, v8 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+62], v70, v[vgprValuC+62] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+62] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v62, v8 v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 buffer_store_short v62, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+63], v74, v[vgprValuC+63] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+63] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v63, v8 v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 buffer_store_short v63, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+64], v78, v[vgprValuC+64] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+64] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v64, v8 v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 buffer_store_short v64, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -9294,480 +9252,480 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,5,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v68, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v68, v8, s74 v_lshlrev_b32 v68, 0x2, v68 // Bias address scaled by BPE ds_read_b32 v65, v68 offset:0 // load Bias ds_read_b32 v66, v68 offset:1024 // load scaleAlpha v_add_lshl_u32 v67, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v67, v16, v67, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v67, v16, v67, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v72, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v72, v8, s74 v_lshlrev_b32 v72, 0x2, v72 // Bias address scaled by BPE ds_read_b32 v69, v72 offset:0 // load Bias ds_read_b32 v70, v72 offset:1024 // load scaleAlpha v_add_lshl_u32 v71, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v71, v16, v71, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v71, v16, v71, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v80, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v80, v8, s74 v_lshlrev_b32 v80, 0x2, v80 // Bias address scaled by BPE ds_read_b32 v77, v80 offset:0 // load Bias ds_read_b32 v78, v80 offset:1024 // load scaleAlpha v_add_lshl_u32 v79, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v79, v16, v79, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v79, v16, v79, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v88, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v88, v8, s74 v_lshlrev_b32 v88, 0x2, v88 // Bias address scaled by BPE ds_read_b32 v85, v88 offset:0 // load Bias ds_read_b32 v86, v88 offset:1024 // load scaleAlpha v_add_lshl_u32 v87, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v87, v16, v87, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v87, v16, v87, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v8, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE ds_read_b32 v89, v92 offset:0 // load Bias ds_read_b32 v90, v92 offset:1024 // load scaleAlpha v_add_lshl_u32 v91, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v91, v16, v91, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v91, v16, v91, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE ds_read_b32 v93, v96 offset:0 // load Bias ds_read_b32 v94, v96 offset:1024 // load scaleAlpha v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v8, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v8, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v103, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v103, v16, v103, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v103, v16, v103, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v8, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v8, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v122, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v122, v4, s74 v_lshlrev_b32 v122, 0x2, v122 // Bias address scaled by BPE v_add_lshl_u32 v121, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v121, v16, v121, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v121, v16, v121, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v128, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v128, v8, s74 v_lshlrev_b32 v128, 0x2, v128 // Bias address scaled by BPE v_add_lshl_u32 v127, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v127, v16, v127, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v127, v16, v127, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v137, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v137, v8, s74 v_lshlrev_b32 v137, 0x2, v137 // Bias address scaled by BPE v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v136, v16, v136, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v136, v16, v136, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v4, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v143, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v143, v8, s74 v_lshlrev_b32 v143, 0x2, v143 // Bias address scaled by BPE v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v142, v16, v142, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v142, v16, v142, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v149, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v149, v8, s74 v_lshlrev_b32 v149, 0x2, v149 // Bias address scaled by BPE v_add_lshl_u32 v148, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v148, v16, v148, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v148, v16, v148, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v155, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v155, v8, s74 v_lshlrev_b32 v155, 0x2, v155 // Bias address scaled by BPE v_add_lshl_u32 v154, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v154, v16, v154, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v154, v16, v154, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v161, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v161, v8, s74 v_lshlrev_b32 v161, 0x2, v161 // Bias address scaled by BPE v_add_lshl_u32 v160, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v160, v16, v160, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v160, v16, v160, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v167, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v167, v8, s74 v_lshlrev_b32 v167, 0x2, v167 // Bias address scaled by BPE v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v166, v16, v166, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v166, v16, v166, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v8, s74 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v171, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v171, v8, s74 v_lshlrev_b32 v171, 0x2, v171 // Bias address scaled by BPE v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc176 // copy acc to vreg[44] v_accvgpr_read_b32 v[vgprValuC+22], acc180 // copy acc to vreg[45] v_accvgpr_read_b32 v[vgprValuC+23], acc184 // copy acc to vreg[46] @@ -9845,265 +9803,265 @@ v_mov_b32 v19, 0x7fff0000 // fp32 Nan v_mov_b32 v20, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+21], v66, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v70, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v71, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v74, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v78, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v79, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v82, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v86, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v87, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v90, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v94, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v66, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v70, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v74, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v78, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v82, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v86, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v90, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v94, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v66, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v70, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v115, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v74, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v78, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v82, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v121, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v86, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v90, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v94, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v66, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v70, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v74, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v78, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v82, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v86, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v90, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v94, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v66, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v70, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v74, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v78, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v82, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v57, v8 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v86, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v58, v8 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v90, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v59, v8 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v94, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v60, v8 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v66, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v61, v8 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+62], v70, v[vgprValuC+62] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+62] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v62, v8 v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 buffer_store_short v62, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+63], v74, v[vgprValuC+63] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+63] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v63, v8 v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 buffer_store_short v63, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+64], v78, v[vgprValuC+64] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+64] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v64, v8 v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 buffer_store_short v64, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -10123,480 +10081,480 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v68, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v68, v4, s74 v_lshlrev_b32 v68, 0x2, v68 // Bias address scaled by BPE ds_read_b32 v65, v68 offset:0 // load Bias ds_read_b32 v66, v68 offset:1024 // load scaleAlpha v_add_lshl_u32 v67, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v67, v16, v67, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v67, v16, v67, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v72, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v72, v8, s74 v_lshlrev_b32 v72, 0x2, v72 // Bias address scaled by BPE ds_read_b32 v69, v72 offset:0 // load Bias ds_read_b32 v70, v72 offset:1024 // load scaleAlpha v_add_lshl_u32 v71, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v71, v16, v71, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v71, v16, v71, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v80, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v80, v8, s74 v_lshlrev_b32 v80, 0x2, v80 // Bias address scaled by BPE ds_read_b32 v77, v80 offset:0 // load Bias ds_read_b32 v78, v80 offset:1024 // load scaleAlpha v_add_lshl_u32 v79, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v79, v16, v79, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v79, v16, v79, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v88, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v88, v8, s74 v_lshlrev_b32 v88, 0x2, v88 // Bias address scaled by BPE ds_read_b32 v85, v88 offset:0 // load Bias ds_read_b32 v86, v88 offset:1024 // load scaleAlpha v_add_lshl_u32 v87, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v87, v16, v87, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v87, v16, v87, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v8, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE ds_read_b32 v89, v92 offset:0 // load Bias ds_read_b32 v90, v92 offset:1024 // load scaleAlpha v_add_lshl_u32 v91, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v91, v16, v91, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v91, v16, v91, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE ds_read_b32 v93, v96 offset:0 // load Bias ds_read_b32 v94, v96 offset:1024 // load scaleAlpha v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v4, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v8, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v103, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v103, v16, v103, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v103, v16, v103, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v8, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v4, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v8, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v122, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v122, v8, s74 v_lshlrev_b32 v122, 0x2, v122 // Bias address scaled by BPE v_add_lshl_u32 v121, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v121, v16, v121, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v121, v16, v121, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v128, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v128, v8, s74 v_lshlrev_b32 v128, 0x2, v128 // Bias address scaled by BPE v_add_lshl_u32 v127, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v127, v16, v127, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v127, v16, v127, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v137, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v137, v8, s74 v_lshlrev_b32 v137, 0x2, v137 // Bias address scaled by BPE v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v136, v16, v136, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v136, v16, v136, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v143, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v143, v8, s74 v_lshlrev_b32 v143, 0x2, v143 // Bias address scaled by BPE v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v142, v16, v142, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v142, v16, v142, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v149, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v149, v4, s74 v_lshlrev_b32 v149, 0x2, v149 // Bias address scaled by BPE v_add_lshl_u32 v148, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v148, v16, v148, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v148, v16, v148, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v155, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v155, v8, s74 v_lshlrev_b32 v155, 0x2, v155 // Bias address scaled by BPE v_add_lshl_u32 v154, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v154, v16, v154, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v154, v16, v154, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v161, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v161, v8, s74 v_lshlrev_b32 v161, 0x2, v161 // Bias address scaled by BPE v_add_lshl_u32 v160, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v160, v16, v160, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v160, v16, v160, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v4, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v167, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v167, v8, s74 v_lshlrev_b32 v167, 0x2, v167 // Bias address scaled by BPE v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v166, v16, v166, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v166, v16, v166, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v8, s74 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v171, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v171, v8, s74 v_lshlrev_b32 v171, 0x2, v171 // Bias address scaled by BPE v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc97 // copy acc to vreg[88] v_accvgpr_read_b32 v[vgprValuC+22], acc101 // copy acc to vreg[89] v_accvgpr_read_b32 v[vgprValuC+23], acc105 // copy acc to vreg[90] @@ -10674,265 +10632,265 @@ v_mov_b32 v19, 0x7fff0000 // fp32 Nan v_mov_b32 v20, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+21], v66, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v70, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v71, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v74, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v78, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v79, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v82, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v86, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v87, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v90, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v94, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v66, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v70, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v74, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v78, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v82, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v86, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v90, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v94, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v66, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v70, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v115, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v74, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v78, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v82, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v121, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v86, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v90, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v94, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v66, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v70, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v74, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v78, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v82, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v86, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v90, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v94, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v66, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v70, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v74, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v78, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v82, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v57, v8 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v86, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v58, v8 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v90, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v59, v8 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v94, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v60, v8 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v66, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v61, v8 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+62], v70, v[vgprValuC+62] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+62] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v62, v8 v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 buffer_store_short v62, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+63], v74, v[vgprValuC+63] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+63] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v63, v8 v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 buffer_store_short v63, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+64], v78, v[vgprValuC+64] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+64] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v64, v8 v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 buffer_store_short v64, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -10948,480 +10906,480 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,16,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v68, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v68, v8, s74 v_lshlrev_b32 v68, 0x2, v68 // Bias address scaled by BPE ds_read_b32 v65, v68 offset:0 // load Bias ds_read_b32 v66, v68 offset:1024 // load scaleAlpha v_add_lshl_u32 v67, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v67, v16, v67, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v67, v16, v67, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v72, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v72, v8, s74 v_lshlrev_b32 v72, 0x2, v72 // Bias address scaled by BPE ds_read_b32 v69, v72 offset:0 // load Bias ds_read_b32 v70, v72 offset:1024 // load scaleAlpha v_add_lshl_u32 v71, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v71, v16, v71, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v71, v16, v71, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v80, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v80, v8, s74 v_lshlrev_b32 v80, 0x2, v80 // Bias address scaled by BPE ds_read_b32 v77, v80 offset:0 // load Bias ds_read_b32 v78, v80 offset:1024 // load scaleAlpha v_add_lshl_u32 v79, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v79, v16, v79, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v79, v16, v79, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v88, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v88, v8, s74 v_lshlrev_b32 v88, 0x2, v88 // Bias address scaled by BPE ds_read_b32 v85, v88 offset:0 // load Bias ds_read_b32 v86, v88 offset:1024 // load scaleAlpha v_add_lshl_u32 v87, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v87, v16, v87, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v87, v16, v87, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v8, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE ds_read_b32 v89, v92 offset:0 // load Bias ds_read_b32 v90, v92 offset:1024 // load scaleAlpha v_add_lshl_u32 v91, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v91, v16, v91, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v91, v16, v91, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE ds_read_b32 v93, v96 offset:0 // load Bias ds_read_b32 v94, v96 offset:1024 // load scaleAlpha v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v8, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v8, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v103, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v103, v16, v103, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v103, v16, v103, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v8, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v8, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v122, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v122, v4, s74 v_lshlrev_b32 v122, 0x2, v122 // Bias address scaled by BPE v_add_lshl_u32 v121, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v121, v16, v121, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v121, v16, v121, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v128, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v128, v8, s74 v_lshlrev_b32 v128, 0x2, v128 // Bias address scaled by BPE v_add_lshl_u32 v127, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v127, v16, v127, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v127, v16, v127, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v137, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v137, v8, s74 v_lshlrev_b32 v137, 0x2, v137 // Bias address scaled by BPE v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v136, v16, v136, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v136, v16, v136, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v4, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v143, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v143, v8, s74 v_lshlrev_b32 v143, 0x2, v143 // Bias address scaled by BPE v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v142, v16, v142, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v142, v16, v142, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v149, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v149, v8, s74 v_lshlrev_b32 v149, 0x2, v149 // Bias address scaled by BPE v_add_lshl_u32 v148, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v148, v16, v148, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v148, v16, v148, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v155, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v155, v8, s74 v_lshlrev_b32 v155, 0x2, v155 // Bias address scaled by BPE v_add_lshl_u32 v154, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v154, v16, v154, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v154, v16, v154, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v161, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v161, v8, s74 v_lshlrev_b32 v161, 0x2, v161 // Bias address scaled by BPE v_add_lshl_u32 v160, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v160, v16, v160, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v160, v16, v160, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v167, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v167, v8, s74 v_lshlrev_b32 v167, 0x2, v167 // Bias address scaled by BPE v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v166, v16, v166, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v166, v16, v166, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v8, s74 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v171, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v171, v8, s74 v_lshlrev_b32 v171, 0x2, v171 // Bias address scaled by BPE v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc18 // copy acc to vreg[132] v_accvgpr_read_b32 v[vgprValuC+22], acc22 // copy acc to vreg[133] v_accvgpr_read_b32 v[vgprValuC+23], acc26 // copy acc to vreg[134] @@ -11499,265 +11457,265 @@ v_mov_b32 v19, 0x7fff0000 // fp32 Nan v_mov_b32 v20, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+21], v66, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v70, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v71, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v74, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v78, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v79, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v82, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v86, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v87, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v90, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v94, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v66, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v70, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v74, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v78, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v82, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v86, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v90, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v94, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v66, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v70, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v115, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v74, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v78, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v82, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v121, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v86, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v90, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v94, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v66, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v70, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v74, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v78, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v82, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v86, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v90, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v94, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v66, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v70, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v74, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v78, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v82, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v57, v8 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v86, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v58, v8 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v90, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v59, v8 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v94, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v60, v8 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v66, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v61, v8 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+62], v70, v[vgprValuC+62] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+62] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v62, v8 v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 buffer_store_short v62, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+63], v74, v[vgprValuC+63] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+63] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v63, v8 v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 buffer_store_short v63, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+64], v78, v[vgprValuC+64] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+64] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v64, v8 v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 buffer_store_short v64, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -11777,480 +11735,480 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v68, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v68, v4, s74 v_lshlrev_b32 v68, 0x2, v68 // Bias address scaled by BPE ds_read_b32 v65, v68 offset:0 // load Bias ds_read_b32 v66, v68 offset:1024 // load scaleAlpha v_add_lshl_u32 v67, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v67, v16, v67, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v67, v16, v67, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v72, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v72, v8, s74 v_lshlrev_b32 v72, 0x2, v72 // Bias address scaled by BPE ds_read_b32 v69, v72 offset:0 // load Bias ds_read_b32 v70, v72 offset:1024 // load scaleAlpha v_add_lshl_u32 v71, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v71, v16, v71, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v71, v16, v71, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v80, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v80, v8, s74 v_lshlrev_b32 v80, 0x2, v80 // Bias address scaled by BPE ds_read_b32 v77, v80 offset:0 // load Bias ds_read_b32 v78, v80 offset:1024 // load scaleAlpha v_add_lshl_u32 v79, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v79, v16, v79, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v79, v16, v79, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v88, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v88, v8, s74 v_lshlrev_b32 v88, 0x2, v88 // Bias address scaled by BPE ds_read_b32 v85, v88 offset:0 // load Bias ds_read_b32 v86, v88 offset:1024 // load scaleAlpha v_add_lshl_u32 v87, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v87, v16, v87, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v87, v16, v87, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v8, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE ds_read_b32 v89, v92 offset:0 // load Bias ds_read_b32 v90, v92 offset:1024 // load scaleAlpha v_add_lshl_u32 v91, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v91, v16, v91, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v91, v16, v91, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE ds_read_b32 v93, v96 offset:0 // load Bias ds_read_b32 v94, v96 offset:1024 // load scaleAlpha v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v4, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v8, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v103, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v103, v16, v103, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v103, v16, v103, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v8, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v4, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v8, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v122, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v122, v8, s74 v_lshlrev_b32 v122, 0x2, v122 // Bias address scaled by BPE v_add_lshl_u32 v121, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v121, v16, v121, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v121, v16, v121, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v128, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v128, v8, s74 v_lshlrev_b32 v128, 0x2, v128 // Bias address scaled by BPE v_add_lshl_u32 v127, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v127, v16, v127, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v127, v16, v127, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v137, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v137, v8, s74 v_lshlrev_b32 v137, 0x2, v137 // Bias address scaled by BPE v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v136, v16, v136, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v136, v16, v136, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v143, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v143, v8, s74 v_lshlrev_b32 v143, 0x2, v143 // Bias address scaled by BPE v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v142, v16, v142, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v142, v16, v142, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v149, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v149, v4, s74 v_lshlrev_b32 v149, 0x2, v149 // Bias address scaled by BPE v_add_lshl_u32 v148, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v148, v16, v148, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v148, v16, v148, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v155, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v155, v8, s74 v_lshlrev_b32 v155, 0x2, v155 // Bias address scaled by BPE v_add_lshl_u32 v154, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v154, v16, v154, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v154, v16, v154, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v161, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v161, v8, s74 v_lshlrev_b32 v161, 0x2, v161 // Bias address scaled by BPE v_add_lshl_u32 v160, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v160, v16, v160, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v160, v16, v160, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v4, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v167, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v167, v8, s74 v_lshlrev_b32 v167, 0x2, v167 // Bias address scaled by BPE v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v166, v16, v166, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v166, v16, v166, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v8, s74 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v171, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v171, v8, s74 v_lshlrev_b32 v171, 0x2, v171 // Bias address scaled by BPE v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc194 // copy acc to vreg[176] v_accvgpr_read_b32 v[vgprValuC+22], acc198 // copy acc to vreg[177] v_accvgpr_read_b32 v[vgprValuC+23], acc202 // copy acc to vreg[178] @@ -12328,265 +12286,265 @@ v_mov_b32 v19, 0x7fff0000 // fp32 Nan v_mov_b32 v20, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+21], v66, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v70, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v71, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v74, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v78, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v79, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v82, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v86, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v87, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v90, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v94, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v66, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v70, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v74, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v78, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v82, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v86, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v90, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v94, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v66, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v70, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v115, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v74, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v78, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v82, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v121, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v86, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v90, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v94, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v66, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v70, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v74, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v78, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v82, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v86, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v90, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v94, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v66, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v70, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v74, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v78, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v82, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v57, v8 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v86, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v58, v8 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v90, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v59, v8 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v94, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v8, v93, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v60, v8 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v66, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v61, v8 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+62], v70, v[vgprValuC+62] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+62] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v62, v8 v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1 buffer_store_short v62, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+63], v74, v[vgprValuC+63] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+63] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v63, v8 v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1 buffer_store_short v63, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+64], v78, v[vgprValuC+64] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+64] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v64, v8 v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1 buffer_store_short v64, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -12602,396 +12560,396 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,27,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v60, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v60, v8, s74 v_lshlrev_b32 v60, 0x2, v60 // Bias address scaled by BPE ds_read_b32 v57, v60 offset:0 // load Bias ds_read_b32 v58, v60 offset:1024 // load scaleAlpha v_add_lshl_u32 v59, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v59, v16, v59, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v59, v16, v59, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v8, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v68, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v68, v8, s74 v_lshlrev_b32 v68, 0x2, v68 // Bias address scaled by BPE ds_read_b32 v65, v68 offset:0 // load Bias ds_read_b32 v66, v68 offset:1024 // load scaleAlpha v_add_lshl_u32 v67, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v67, v16, v67, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v67, v16, v67, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v72, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v72, v8, s74 v_lshlrev_b32 v72, 0x2, v72 // Bias address scaled by BPE ds_read_b32 v69, v72 offset:0 // load Bias ds_read_b32 v70, v72 offset:1024 // load scaleAlpha v_add_lshl_u32 v71, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v71, v16, v71, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v71, v16, v71, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v4, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v80, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v80, v8, s74 v_lshlrev_b32 v80, 0x2, v80 // Bias address scaled by BPE ds_read_b32 v77, v80 offset:0 // load Bias ds_read_b32 v78, v80 offset:1024 // load scaleAlpha v_add_lshl_u32 v79, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v79, v16, v79, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v79, v16, v79, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v88, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v88, v8, s74 v_lshlrev_b32 v88, 0x2, v88 // Bias address scaled by BPE ds_read_b32 v85, v88 offset:0 // load Bias ds_read_b32 v86, v88 offset:1024 // load scaleAlpha v_add_lshl_u32 v87, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v87, v16, v87, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v87, v16, v87, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v90, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v90, v8, s74 v_lshlrev_b32 v90, 0x2, v90 // Bias address scaled by BPE v_add_lshl_u32 v89, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v89, v16, v89, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v89, v16, v89, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v8, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v91, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v91, v16, v91, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v91, v16, v91, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v8, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v4, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v8, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v103, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v103, v16, v103, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v103, v16, v103, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v8, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v4, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v8, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v122, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v122, v8, s74 v_lshlrev_b32 v122, 0x2, v122 // Bias address scaled by BPE v_add_lshl_u32 v121, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v121, v16, v121, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v121, v16, v121, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v128, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v128, v8, s74 v_lshlrev_b32 v128, 0x2, v128 // Bias address scaled by BPE v_add_lshl_u32 v127, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v127, v16, v127, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v127, v16, v127, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v137, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v137, v8, s74 v_lshlrev_b32 v137, 0x2, v137 // Bias address scaled by BPE v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v136, v16, v136, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v136, v16, v136, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v143, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v143, v8, s74 v_lshlrev_b32 v143, 0x2, v143 // Bias address scaled by BPE v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v142, v16, v142, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v142, v16, v142, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc115 // copy acc to vreg[220] v_accvgpr_read_b32 v[vgprValuC+22], acc119 // copy acc to vreg[221] v_accvgpr_read_b32 v[vgprValuC+23], acc123 // copy acc to vreg[222] @@ -13057,234 +13015,234 @@ v_mov_b32 v19, 0x7fff0000 // fp32 Nan v_mov_b32 v20, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+21], v58, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v8, v57, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v59, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v62, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v66, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v70, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v71, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v74, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v78, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v79, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v82, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v86, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v87, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v58, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v8, v57, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v89, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v62, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v66, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v70, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v74, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v78, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v82, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v86, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v58, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v8, v57, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v62, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v66, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v70, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v74, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v78, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v115, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v82, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v86, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v58, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v8, v57, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v121, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v62, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v66, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v70, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v74, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v78, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v82, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v86, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v58, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v8, v57, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v62, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v66, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v70, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst s_branch label_GW_End // jump to end label_GW_Beta: -s_and_b32 s82, 255, s[sgprSizeI] // s82 = s[sgprSizeI] % 256 -s_add_u32 s83, -0x1, s[sgprNumWorkGroups0] -s_cmp_ge_u32 s[sgprWorkGroup0], s83 // wg0 >= nwg0-1 ? -s_cselect_b32 s82, s82, 0 // set rMT0 -s_cmpk_gt_u32 s82, 0 // rMT0 > 0 +s_and_b32 s74, 255, s[sgprSizeI] // s74 = s[sgprSizeI] % 256 +s_add_u32 s75, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s75 // wg0 >= nwg0-1 ? +s_cselect_b32 s74, s74, 0 // set rMT0 +s_cmpk_gt_u32 s74, 0 // rMT0 > 0 s_cbranch_scc1 label_GW_B1_E1_M // jump if edges required -s_and_b32 s82, 255, s[sgprSizeJ] // s82 = s[sgprSizeJ] % 256 -s_add_u32 s83, -0x1, s[sgprNumWorkGroups1] -s_cmp_ge_u32 s[sgprWorkGroup1], s83 // wg1 >= nwg1-1 -s_cselect_b32 s82, s82, 0 // set rMT1 -s_cmpk_gt_u32 s82, 0 // rMT1 > 0 +s_and_b32 s74, 255, s[sgprSizeJ] // s74 = s[sgprSizeJ] % 256 +s_add_u32 s75, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s75 // wg1 >= nwg1-1 +s_cselect_b32 s74, s74, 0 // set rMT1 +s_cmpk_gt_u32 s74, 0 // rMT1 > 0 s_cbranch_scc1 label_GW_B1_E1_N // jump if edges required label_GW_B1_E0: s_cmpk_eq_u32 s[sgprActivationType], 3 // activationType == 3 @@ -13293,28 +13251,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_1_edge_0 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_1_edge_0 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_1_edge_0 // Branch if true label_To_Activation_None_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_To_Activation_Gelu_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_To_Activation_Relu_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_To_Activation_Silu_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_2 +label_To_Activation_Clamp_VW8_beta_1_edge_0: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s73, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_ActivationSetPCAddrEnd_2: @@ -13331,8 +13297,8 @@ label_ActivationSetPCAddrEnd_2: /* (d1,vc1,d0,vc0)=(0,0,0,0) */ v_add_lshl_u32 v22, v6, v4, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=4, coord0Vgpr=4 buffer_load_dwordx4 v[72:75], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v23, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v23, v4, s74 v_lshlrev_b32 v23, 0x2, v23 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -13341,28 +13307,28 @@ ds_read_b128 v[84:87], v23 offset:16 // load Bias ds_read_b128 v[88:91], v23 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v23 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,1,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[76:79], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,2,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,3,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,4,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,5,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[108:111], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_add_lshl_u32 v21, v7, v4, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=4, coord0Vgpr=4 @@ -13471,7 +13437,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -13507,7 +13473,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -13516,8 +13482,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13546,7 +13512,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -13555,8 +13521,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13585,7 +13551,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -13594,8 +13560,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13624,7 +13590,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -13633,8 +13599,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13663,7 +13629,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -13672,8 +13638,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -13686,8 +13652,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,6,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[72:75], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[80:83], v23 offset:0 // load Bias @@ -13695,28 +13661,28 @@ ds_read_b128 v[84:87], v23 offset:16 // load Bias ds_read_b128 v[88:91], v23 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v23 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,7,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[76:79], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,8,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,9,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,10,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,11,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[108:111], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc192 // copy acc to vreg[48] @@ -13824,7 +13790,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -13833,8 +13799,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13863,7 +13829,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -13872,8 +13838,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13902,7 +13868,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -13911,8 +13877,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13941,7 +13907,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -13950,8 +13916,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13980,7 +13946,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -13989,8 +13955,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14019,7 +13985,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -14028,8 +13994,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -14042,8 +14008,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,12,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[72:75], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[80:83], v23 offset:0 // load Bias @@ -14051,28 +14017,28 @@ ds_read_b128 v[84:87], v23 offset:16 // load Bias ds_read_b128 v[88:91], v23 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v23 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,13,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[76:79], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,14,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,15,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,16,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,17,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[108:111], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc129 // copy acc to vreg[96] @@ -14180,7 +14146,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -14189,8 +14155,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14219,7 +14185,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -14228,8 +14194,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14258,7 +14224,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -14267,8 +14233,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14297,7 +14263,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -14306,8 +14272,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14336,7 +14302,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -14345,8 +14311,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14375,7 +14341,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -14384,8 +14350,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -14398,8 +14364,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,18,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[72:75], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[80:83], v23 offset:0 // load Bias @@ -14407,28 +14373,28 @@ ds_read_b128 v[84:87], v23 offset:16 // load Bias ds_read_b128 v[88:91], v23 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v23 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,19,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[76:79], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,20,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,21,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,22,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,23,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[108:111], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc66 // copy acc to vreg[144] @@ -14536,7 +14502,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -14545,8 +14511,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14575,7 +14541,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -14584,8 +14550,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14614,7 +14580,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -14623,8 +14589,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14653,7 +14619,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -14662,8 +14628,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14692,7 +14658,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -14701,8 +14667,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14731,7 +14697,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -14740,8 +14706,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -14754,8 +14720,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,24,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[72:75], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[80:83], v23 offset:0 // load Bias @@ -14763,28 +14729,28 @@ ds_read_b128 v[84:87], v23 offset:16 // load Bias ds_read_b128 v[88:91], v23 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v23 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,25,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[76:79], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,26,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,27,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,28,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,29,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[108:111], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc3 // copy acc to vreg[192] @@ -14892,7 +14858,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -14901,8 +14867,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14931,7 +14897,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -14940,8 +14906,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14970,7 +14936,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -14979,8 +14945,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15009,7 +14975,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -15018,8 +14984,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15048,7 +15014,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -15057,8 +15023,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15087,7 +15053,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -15096,8 +15062,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -15110,8 +15076,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,30,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[40:43], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[48:51], v23 offset:0 // load Bias @@ -15119,8 +15085,8 @@ ds_read_b128 v[52:55], v23 offset:16 // load Bias ds_read_b128 v[56:59], v23 offset:1024 // load scaleAlpha ds_read_b128 v[60:63], v23 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,31,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[44:47], v22, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc195 // copy acc to vreg[240] @@ -15180,7 +15146,7 @@ v_pk_add_f32 v[8:9], v[48:49], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[50:51], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[52:53], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[54:55], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -15189,8 +15155,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15219,7 +15185,7 @@ v_pk_add_f32 v[8:9], v[48:49], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[50:51], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[52:53], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[54:55], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -15228,8 +15194,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -15241,28 +15207,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_1_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_1_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_1_edge_1 // Branch if true label_To_Activation_None_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_To_Activation_Gelu_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_To_Activation_Relu_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_To_Activation_Silu_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_1 +label_To_Activation_Clamp_VW8_beta_1_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s73, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_ActivationSetPCAddrEnd_1: @@ -15278,14 +15252,14 @@ label_ActivationSetPCAddrEnd_1: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v21, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[72:75], v21, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -15294,92 +15268,92 @@ ds_read_b128 v[84:87], v22 offset:16 // load Bias ds_read_b128 v[88:91], v22 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v22 offset:1040 // load scaleAlpha v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v23, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[76:79], v23, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v4, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v97, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v97, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v4, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v99, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[104:107], v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v109, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v109, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v111, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[116:119], v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v4, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] @@ -15484,7 +15458,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -15518,7 +15492,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -15552,7 +15526,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -15586,7 +15560,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -15620,7 +15594,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -15654,7 +15628,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -15680,106 +15654,106 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v21, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[72:75], v21, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE ds_read_b128 v[80:83], v22 offset:0 // load Bias ds_read_b128 v[84:87], v22 offset:16 // load Bias ds_read_b128 v[88:91], v22 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v22 offset:1040 // load scaleAlpha v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v23, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[76:79], v23, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v4, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v97, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v97, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v4, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v99, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[104:107], v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v109, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v109, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v111, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[116:119], v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v4, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc192 // copy acc to vreg[48] v_accvgpr_read_b32 v[vgprValuC+25], acc196 // copy acc to vreg[49] v_accvgpr_read_b32 v[vgprValuC+26], acc200 // copy acc to vreg[50] @@ -15884,7 +15858,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -15918,7 +15892,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -15952,7 +15926,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -15986,7 +15960,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -16020,7 +15994,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -16054,7 +16028,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -16080,106 +16054,106 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v21, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[72:75], v21, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE ds_read_b128 v[80:83], v22 offset:0 // load Bias ds_read_b128 v[84:87], v22 offset:16 // load Bias ds_read_b128 v[88:91], v22 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v22 offset:1040 // load scaleAlpha v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v23, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[76:79], v23, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v4, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v97, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v97, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v4, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v99, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[104:107], v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v109, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v109, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v111, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[116:119], v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v4, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc129 // copy acc to vreg[96] v_accvgpr_read_b32 v[vgprValuC+25], acc133 // copy acc to vreg[97] v_accvgpr_read_b32 v[vgprValuC+26], acc137 // copy acc to vreg[98] @@ -16284,7 +16258,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -16318,7 +16292,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -16352,7 +16326,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -16386,7 +16360,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -16420,7 +16394,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -16454,7 +16428,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -16480,106 +16454,106 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v21, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[72:75], v21, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE ds_read_b128 v[80:83], v22 offset:0 // load Bias ds_read_b128 v[84:87], v22 offset:16 // load Bias ds_read_b128 v[88:91], v22 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v22 offset:1040 // load scaleAlpha v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v23, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[76:79], v23, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v4, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v97, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v97, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v4, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v99, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[104:107], v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v109, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v109, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v111, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[116:119], v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v4, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc66 // copy acc to vreg[144] v_accvgpr_read_b32 v[vgprValuC+25], acc70 // copy acc to vreg[145] v_accvgpr_read_b32 v[vgprValuC+26], acc74 // copy acc to vreg[146] @@ -16684,7 +16658,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -16718,7 +16692,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -16752,7 +16726,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -16786,7 +16760,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -16820,7 +16794,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -16854,7 +16828,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -16880,106 +16854,106 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v21, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[72:75], v21, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE ds_read_b128 v[80:83], v22 offset:0 // load Bias ds_read_b128 v[84:87], v22 offset:16 // load Bias ds_read_b128 v[88:91], v22 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v22 offset:1040 // load scaleAlpha v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v23, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[76:79], v23, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v4, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v97, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v97, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v4, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v99, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[104:107], v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v109, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v109, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v111, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[116:119], v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v4, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc3 // copy acc to vreg[192] v_accvgpr_read_b32 v[vgprValuC+25], acc7 // copy acc to vreg[193] v_accvgpr_read_b32 v[vgprValuC+26], acc11 // copy acc to vreg[194] @@ -17084,7 +17058,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -17118,7 +17092,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -17152,7 +17126,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -17186,7 +17160,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -17220,7 +17194,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -17254,7 +17228,7 @@ v_pk_add_f32 v[8:9], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -17280,38 +17254,38 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v21, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[40:43], v21, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE ds_read_b128 v[48:51], v22 offset:0 // load Bias ds_read_b128 v[52:55], v22 offset:16 // load Bias ds_read_b128 v[56:59], v22 offset:1024 // load scaleAlpha ds_read_b128 v[60:63], v22 offset:1040 // load scaleAlpha v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v23, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[44:47], v23, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc195 // copy acc to vreg[240] v_accvgpr_read_b32 v[vgprValuC+25], acc199 // copy acc to vreg[241] v_accvgpr_read_b32 v[vgprValuC+26], acc203 // copy acc to vreg[242] @@ -17368,7 +17342,7 @@ v_pk_add_f32 v[8:9], v[48:49], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[50:51], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[52:53], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[54:55], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -17402,7 +17376,7 @@ v_pk_add_f32 v[8:9], v[48:49], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[50:51], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[52:53], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[54:55], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -17421,28 +17395,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW1_beta_1_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW1_beta_1_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW1_beta_1_edge_1 // Branch if true label_To_Activation_None_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_None_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_None_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_To_Activation_Gelu_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Gelu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Gelu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_To_Activation_Relu_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Relu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Relu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_To_Activation_Silu_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Silu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Silu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd +label_To_Activation_Clamp_VW1_beta_1_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s73, label_Activation_Clamp_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_ActivationSetPCAddrEnd: @@ -17458,506 +17440,506 @@ label_ActivationSetPCAddrEnd: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v60, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v57, v60, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v61, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v61, v4, s74 v_lshlrev_b32 v61, 0x2, v61 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier ds_read_b32 v58, v61 offset:0 // load Bias ds_read_b32 v59, v61 offset:1024 // load scaleAlpha v_add_lshl_u32 v60, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v65, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v62, v65, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v66, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v66, v8, s74 v_lshlrev_b32 v66, 0x2, v66 // Bias address scaled by BPE ds_read_b32 v63, v66 offset:0 // load Bias ds_read_b32 v64, v66 offset:1024 // load scaleAlpha v_add_lshl_u32 v65, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v70, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v67, v70, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v71, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v71, v8, s74 v_lshlrev_b32 v71, 0x2, v71 // Bias address scaled by BPE ds_read_b32 v68, v71 offset:0 // load Bias ds_read_b32 v69, v71 offset:1024 // load scaleAlpha v_add_lshl_u32 v70, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v75, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v72, v75, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v80, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v77, v80, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v8, s74 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v85, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v82, v85, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v86, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v86, v8, s74 v_lshlrev_b32 v86, 0x2, v86 // Bias address scaled by BPE ds_read_b32 v83, v86 offset:0 // load Bias ds_read_b32 v84, v86 offset:1024 // load scaleAlpha v_add_lshl_u32 v85, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v90, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v87, v90, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v91, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v91, v8, s74 v_lshlrev_b32 v91, 0x2, v91 // Bias address scaled by BPE ds_read_b32 v88, v91 offset:0 // load Bias ds_read_b32 v89, v91 offset:1024 // load scaleAlpha v_add_lshl_u32 v90, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v95, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v92, v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE ds_read_b32 v93, v96 offset:0 // load Bias ds_read_b32 v94, v96 offset:1024 // load scaleAlpha v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v98, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s74 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v101, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v100, v101, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v8, s74 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v107, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v106, v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v110, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v8, s74 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v113, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v112, v113, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v8, s74 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v119, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v118, v119, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v122, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s74 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v125, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v124, v125, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v8, s74 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v131, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v130, v131, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v8, s74 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v140, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v139, v140, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v143, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v8, s74 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v146, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v145, v146, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v149, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s74 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v152, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v151, v152, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v8, s74 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v158, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v157, v158, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v161, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v8, s74 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v164, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v163, v164, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v8, s74 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v170, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v169, v170, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v171, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v171, v8, s74 v_lshlrev_b32 v171, 0x2, v171 // Bias address scaled by BPE v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v173, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v174, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v174, v4, s74 v_lshlrev_b32 v174, 0x2, v174 // Bias address scaled by BPE v_add_lshl_u32 v173, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v176, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v175, v176, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v177, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v177, v8, s74 v_lshlrev_b32 v177, 0x2, v177 // Bias address scaled by BPE v_add_lshl_u32 v176, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v180, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v180, v8, s74 v_lshlrev_b32 v180, 0x2, v180 // Bias address scaled by BPE v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v182, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v181, v182, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v183, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v183, v8, s74 v_lshlrev_b32 v183, 0x2, v183 // Bias address scaled by BPE v_add_lshl_u32 v182, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+22], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+23], acc8 // copy acc to vreg[2] @@ -18025,7 +18007,7 @@ v_mul_f32 v[vgprValuC+21], v59, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v57 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18033,7 +18015,7 @@ v_mul_f32 v[vgprValuC+22], v64, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v62 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v65, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18041,7 +18023,7 @@ v_mul_f32 v[vgprValuC+23], v69, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v67 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18049,7 +18031,7 @@ v_mul_f32 v[vgprValuC+24], v74, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v72 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18057,7 +18039,7 @@ v_mul_f32 v[vgprValuC+25], v79, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v77 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18065,7 +18047,7 @@ v_mul_f32 v[vgprValuC+26], v84, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v82 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18073,7 +18055,7 @@ v_mul_f32 v[vgprValuC+27], v89, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v87 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18081,7 +18063,7 @@ v_mul_f32 v[vgprValuC+28], v94, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v92 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18089,7 +18071,7 @@ v_mul_f32 v[vgprValuC+29], v59, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18097,7 +18079,7 @@ v_mul_f32 v[vgprValuC+30], v64, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v100 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18105,7 +18087,7 @@ v_mul_f32 v[vgprValuC+31], v69, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v103 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18113,7 +18095,7 @@ v_mul_f32 v[vgprValuC+32], v74, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v106 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18121,7 +18103,7 @@ v_mul_f32 v[vgprValuC+33], v79, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v109 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18129,7 +18111,7 @@ v_mul_f32 v[vgprValuC+34], v84, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v112 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18137,7 +18119,7 @@ v_mul_f32 v[vgprValuC+35], v89, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v115 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18145,7 +18127,7 @@ v_mul_f32 v[vgprValuC+36], v94, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v118 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18153,7 +18135,7 @@ v_mul_f32 v[vgprValuC+37], v59, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v121 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18161,7 +18143,7 @@ v_mul_f32 v[vgprValuC+38], v64, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v124 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18169,7 +18151,7 @@ v_mul_f32 v[vgprValuC+39], v69, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v127 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18177,7 +18159,7 @@ v_mul_f32 v[vgprValuC+40], v74, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18185,7 +18167,7 @@ v_mul_f32 v[vgprValuC+41], v79, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v136 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18193,7 +18175,7 @@ v_mul_f32 v[vgprValuC+42], v84, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v139 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18201,7 +18183,7 @@ v_mul_f32 v[vgprValuC+43], v89, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v142 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18209,7 +18191,7 @@ v_mul_f32 v[vgprValuC+44], v94, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v145 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18217,7 +18199,7 @@ v_mul_f32 v[vgprValuC+45], v59, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18225,7 +18207,7 @@ v_mul_f32 v[vgprValuC+46], v64, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v151 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18233,7 +18215,7 @@ v_mul_f32 v[vgprValuC+47], v69, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18241,7 +18223,7 @@ v_mul_f32 v[vgprValuC+48], v74, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v157 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18249,7 +18231,7 @@ v_mul_f32 v[vgprValuC+49], v79, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18257,7 +18239,7 @@ v_mul_f32 v[vgprValuC+50], v84, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v163 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18265,7 +18247,7 @@ v_mul_f32 v[vgprValuC+51], v89, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18273,7 +18255,7 @@ v_mul_f32 v[vgprValuC+52], v94, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v169 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18281,7 +18263,7 @@ v_mul_f32 v[vgprValuC+53], v59, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18289,7 +18271,7 @@ v_mul_f32 v[vgprValuC+54], v64, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v175 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18297,7 +18279,7 @@ v_mul_f32 v[vgprValuC+55], v69, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18305,7 +18287,7 @@ v_mul_f32 v[vgprValuC+56], v74, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v181 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18321,504 +18303,504 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,4,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v60, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v57, v60, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v61, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v61, v8, s74 v_lshlrev_b32 v61, 0x2, v61 // Bias address scaled by BPE ds_read_b32 v58, v61 offset:0 // load Bias ds_read_b32 v59, v61 offset:1024 // load scaleAlpha v_add_lshl_u32 v60, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v65, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v62, v65, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v66, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v66, v8, s74 v_lshlrev_b32 v66, 0x2, v66 // Bias address scaled by BPE ds_read_b32 v63, v66 offset:0 // load Bias ds_read_b32 v64, v66 offset:1024 // load scaleAlpha v_add_lshl_u32 v65, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v70, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v67, v70, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v71, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v71, v8, s74 v_lshlrev_b32 v71, 0x2, v71 // Bias address scaled by BPE ds_read_b32 v68, v71 offset:0 // load Bias ds_read_b32 v69, v71 offset:1024 // load scaleAlpha v_add_lshl_u32 v70, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v75, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v72, v75, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v80, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v77, v80, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v4, s74 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v85, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v82, v85, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v86, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v86, v8, s74 v_lshlrev_b32 v86, 0x2, v86 // Bias address scaled by BPE ds_read_b32 v83, v86 offset:0 // load Bias ds_read_b32 v84, v86 offset:1024 // load scaleAlpha v_add_lshl_u32 v85, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v90, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v87, v90, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v91, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v91, v8, s74 v_lshlrev_b32 v91, 0x2, v91 // Bias address scaled by BPE ds_read_b32 v88, v91 offset:0 // load Bias ds_read_b32 v89, v91 offset:1024 // load scaleAlpha v_add_lshl_u32 v90, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v95, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v92, v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE ds_read_b32 v93, v96 offset:0 // load Bias ds_read_b32 v94, v96 offset:1024 // load scaleAlpha v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v98, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v8, s74 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v101, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v100, v101, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v8, s74 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v107, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v106, v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v110, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v4, s74 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v113, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v112, v113, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v8, s74 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v119, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v118, v119, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v122, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v8, s74 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v125, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v124, v125, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v8, s74 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v131, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v130, v131, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v137, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v4, s74 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v140, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v139, v140, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v143, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v8, s74 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v146, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v145, v146, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v149, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v8, s74 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v152, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v151, v152, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v8, s74 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v158, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v157, v158, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v161, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v4, s74 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v164, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v163, v164, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v8, s74 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v170, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v169, v170, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v171, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v171, v8, s74 v_lshlrev_b32 v171, 0x2, v171 // Bias address scaled by BPE v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v173, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v174, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v174, v8, s74 v_lshlrev_b32 v174, 0x2, v174 // Bias address scaled by BPE v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v176, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v175, v176, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v177, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v177, v8, s74 v_lshlrev_b32 v177, 0x2, v177 // Bias address scaled by BPE v_add_lshl_u32 v176, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v180, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v180, v8, s74 v_lshlrev_b32 v180, 0x2, v180 // Bias address scaled by BPE v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v182, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v181, v182, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v183, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v183, v8, s74 v_lshlrev_b32 v183, 0x2, v183 // Bias address scaled by BPE v_add_lshl_u32 v182, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc144 // copy acc to vreg[36] v_accvgpr_read_b32 v[vgprValuC+22], acc148 // copy acc to vreg[37] v_accvgpr_read_b32 v[vgprValuC+23], acc152 // copy acc to vreg[38] @@ -18886,7 +18868,7 @@ v_mul_f32 v[vgprValuC+21], v59, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v57 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18894,7 +18876,7 @@ v_mul_f32 v[vgprValuC+22], v64, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v62 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v65, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18902,7 +18884,7 @@ v_mul_f32 v[vgprValuC+23], v69, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v67 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18910,7 +18892,7 @@ v_mul_f32 v[vgprValuC+24], v74, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v72 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18918,7 +18900,7 @@ v_mul_f32 v[vgprValuC+25], v79, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v77 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18926,7 +18908,7 @@ v_mul_f32 v[vgprValuC+26], v84, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v82 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18934,7 +18916,7 @@ v_mul_f32 v[vgprValuC+27], v89, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v87 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18942,7 +18924,7 @@ v_mul_f32 v[vgprValuC+28], v94, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v92 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18950,7 +18932,7 @@ v_mul_f32 v[vgprValuC+29], v59, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18958,7 +18940,7 @@ v_mul_f32 v[vgprValuC+30], v64, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v100 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18966,7 +18948,7 @@ v_mul_f32 v[vgprValuC+31], v69, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v103 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18974,7 +18956,7 @@ v_mul_f32 v[vgprValuC+32], v74, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v106 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18982,7 +18964,7 @@ v_mul_f32 v[vgprValuC+33], v79, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v109 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18990,7 +18972,7 @@ v_mul_f32 v[vgprValuC+34], v84, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v112 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18998,7 +18980,7 @@ v_mul_f32 v[vgprValuC+35], v89, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v115 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19006,7 +18988,7 @@ v_mul_f32 v[vgprValuC+36], v94, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v118 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19014,7 +18996,7 @@ v_mul_f32 v[vgprValuC+37], v59, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v121 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19022,7 +19004,7 @@ v_mul_f32 v[vgprValuC+38], v64, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v124 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19030,7 +19012,7 @@ v_mul_f32 v[vgprValuC+39], v69, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v127 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19038,7 +19020,7 @@ v_mul_f32 v[vgprValuC+40], v74, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19046,7 +19028,7 @@ v_mul_f32 v[vgprValuC+41], v79, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v136 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19054,7 +19036,7 @@ v_mul_f32 v[vgprValuC+42], v84, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v139 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19062,7 +19044,7 @@ v_mul_f32 v[vgprValuC+43], v89, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v142 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19070,7 +19052,7 @@ v_mul_f32 v[vgprValuC+44], v94, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v145 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19078,7 +19060,7 @@ v_mul_f32 v[vgprValuC+45], v59, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19086,7 +19068,7 @@ v_mul_f32 v[vgprValuC+46], v64, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v151 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19094,7 +19076,7 @@ v_mul_f32 v[vgprValuC+47], v69, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19102,7 +19084,7 @@ v_mul_f32 v[vgprValuC+48], v74, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v157 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19110,7 +19092,7 @@ v_mul_f32 v[vgprValuC+49], v79, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19118,7 +19100,7 @@ v_mul_f32 v[vgprValuC+50], v84, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v163 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19126,7 +19108,7 @@ v_mul_f32 v[vgprValuC+51], v89, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19134,7 +19116,7 @@ v_mul_f32 v[vgprValuC+52], v94, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v169 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19142,7 +19124,7 @@ v_mul_f32 v[vgprValuC+53], v59, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19150,7 +19132,7 @@ v_mul_f32 v[vgprValuC+54], v64, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v175 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19158,7 +19140,7 @@ v_mul_f32 v[vgprValuC+55], v69, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19166,7 +19148,7 @@ v_mul_f32 v[vgprValuC+56], v74, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v181 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19186,504 +19168,504 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v60, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v57, v60, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v61, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v61, v4, s74 v_lshlrev_b32 v61, 0x2, v61 // Bias address scaled by BPE ds_read_b32 v58, v61 offset:0 // load Bias ds_read_b32 v59, v61 offset:1024 // load scaleAlpha v_add_lshl_u32 v60, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v65, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v62, v65, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v66, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v66, v8, s74 v_lshlrev_b32 v66, 0x2, v66 // Bias address scaled by BPE ds_read_b32 v63, v66 offset:0 // load Bias ds_read_b32 v64, v66 offset:1024 // load scaleAlpha v_add_lshl_u32 v65, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v70, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v67, v70, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v71, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v71, v8, s74 v_lshlrev_b32 v71, 0x2, v71 // Bias address scaled by BPE ds_read_b32 v68, v71 offset:0 // load Bias ds_read_b32 v69, v71 offset:1024 // load scaleAlpha v_add_lshl_u32 v70, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v75, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v72, v75, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v80, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v77, v80, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v8, s74 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v85, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v82, v85, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v86, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v86, v8, s74 v_lshlrev_b32 v86, 0x2, v86 // Bias address scaled by BPE ds_read_b32 v83, v86 offset:0 // load Bias ds_read_b32 v84, v86 offset:1024 // load scaleAlpha v_add_lshl_u32 v85, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v90, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v87, v90, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v91, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v91, v8, s74 v_lshlrev_b32 v91, 0x2, v91 // Bias address scaled by BPE ds_read_b32 v88, v91 offset:0 // load Bias ds_read_b32 v89, v91 offset:1024 // load scaleAlpha v_add_lshl_u32 v90, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v95, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v92, v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE ds_read_b32 v93, v96 offset:0 // load Bias ds_read_b32 v94, v96 offset:1024 // load scaleAlpha v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v98, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s74 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v101, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v100, v101, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v8, s74 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v107, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v106, v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v110, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v8, s74 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v113, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v112, v113, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v8, s74 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v119, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v118, v119, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v122, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s74 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v125, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v124, v125, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v8, s74 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v131, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v130, v131, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v8, s74 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v140, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v139, v140, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v143, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v8, s74 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v146, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v145, v146, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v149, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s74 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v152, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v151, v152, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v8, s74 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v158, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v157, v158, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v161, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v8, s74 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v164, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v163, v164, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v8, s74 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v170, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v169, v170, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v171, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v171, v8, s74 v_lshlrev_b32 v171, 0x2, v171 // Bias address scaled by BPE v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v173, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v174, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v174, v4, s74 v_lshlrev_b32 v174, 0x2, v174 // Bias address scaled by BPE v_add_lshl_u32 v173, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v176, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v175, v176, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v177, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v177, v8, s74 v_lshlrev_b32 v177, 0x2, v177 // Bias address scaled by BPE v_add_lshl_u32 v176, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v180, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v180, v8, s74 v_lshlrev_b32 v180, 0x2, v180 // Bias address scaled by BPE v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v182, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v181, v182, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v183, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v183, v8, s74 v_lshlrev_b32 v183, 0x2, v183 // Bias address scaled by BPE v_add_lshl_u32 v182, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc33 // copy acc to vreg[72] v_accvgpr_read_b32 v[vgprValuC+22], acc37 // copy acc to vreg[73] v_accvgpr_read_b32 v[vgprValuC+23], acc41 // copy acc to vreg[74] @@ -19751,7 +19733,7 @@ v_mul_f32 v[vgprValuC+21], v59, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v57 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19759,7 +19741,7 @@ v_mul_f32 v[vgprValuC+22], v64, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v62 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v65, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19767,7 +19749,7 @@ v_mul_f32 v[vgprValuC+23], v69, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v67 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19775,7 +19757,7 @@ v_mul_f32 v[vgprValuC+24], v74, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v72 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19783,7 +19765,7 @@ v_mul_f32 v[vgprValuC+25], v79, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v77 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19791,7 +19773,7 @@ v_mul_f32 v[vgprValuC+26], v84, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v82 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19799,7 +19781,7 @@ v_mul_f32 v[vgprValuC+27], v89, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v87 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19807,7 +19789,7 @@ v_mul_f32 v[vgprValuC+28], v94, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v92 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19815,7 +19797,7 @@ v_mul_f32 v[vgprValuC+29], v59, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19823,7 +19805,7 @@ v_mul_f32 v[vgprValuC+30], v64, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v100 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19831,7 +19813,7 @@ v_mul_f32 v[vgprValuC+31], v69, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v103 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19839,7 +19821,7 @@ v_mul_f32 v[vgprValuC+32], v74, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v106 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19847,7 +19829,7 @@ v_mul_f32 v[vgprValuC+33], v79, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v109 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19855,7 +19837,7 @@ v_mul_f32 v[vgprValuC+34], v84, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v112 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19863,7 +19845,7 @@ v_mul_f32 v[vgprValuC+35], v89, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v115 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19871,7 +19853,7 @@ v_mul_f32 v[vgprValuC+36], v94, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v118 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19879,7 +19861,7 @@ v_mul_f32 v[vgprValuC+37], v59, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v121 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19887,7 +19869,7 @@ v_mul_f32 v[vgprValuC+38], v64, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v124 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19895,7 +19877,7 @@ v_mul_f32 v[vgprValuC+39], v69, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v127 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19903,7 +19885,7 @@ v_mul_f32 v[vgprValuC+40], v74, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19911,7 +19893,7 @@ v_mul_f32 v[vgprValuC+41], v79, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v136 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19919,7 +19901,7 @@ v_mul_f32 v[vgprValuC+42], v84, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v139 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19927,7 +19909,7 @@ v_mul_f32 v[vgprValuC+43], v89, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v142 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19935,7 +19917,7 @@ v_mul_f32 v[vgprValuC+44], v94, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v145 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19943,7 +19925,7 @@ v_mul_f32 v[vgprValuC+45], v59, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19951,7 +19933,7 @@ v_mul_f32 v[vgprValuC+46], v64, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v151 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19959,7 +19941,7 @@ v_mul_f32 v[vgprValuC+47], v69, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19967,7 +19949,7 @@ v_mul_f32 v[vgprValuC+48], v74, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v157 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19975,7 +19957,7 @@ v_mul_f32 v[vgprValuC+49], v79, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19983,7 +19965,7 @@ v_mul_f32 v[vgprValuC+50], v84, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v163 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19991,7 +19973,7 @@ v_mul_f32 v[vgprValuC+51], v89, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19999,7 +19981,7 @@ v_mul_f32 v[vgprValuC+52], v94, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v169 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20007,7 +19989,7 @@ v_mul_f32 v[vgprValuC+53], v59, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20015,7 +19997,7 @@ v_mul_f32 v[vgprValuC+54], v64, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v175 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20023,7 +20005,7 @@ v_mul_f32 v[vgprValuC+55], v69, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20031,7 +20013,7 @@ v_mul_f32 v[vgprValuC+56], v74, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v181 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20047,504 +20029,504 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,13,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v60, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v57, v60, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v61, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v61, v8, s74 v_lshlrev_b32 v61, 0x2, v61 // Bias address scaled by BPE ds_read_b32 v58, v61 offset:0 // load Bias ds_read_b32 v59, v61 offset:1024 // load scaleAlpha v_add_lshl_u32 v60, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v65, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v62, v65, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v66, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v66, v8, s74 v_lshlrev_b32 v66, 0x2, v66 // Bias address scaled by BPE ds_read_b32 v63, v66 offset:0 // load Bias ds_read_b32 v64, v66 offset:1024 // load scaleAlpha v_add_lshl_u32 v65, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v70, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v67, v70, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v71, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v71, v8, s74 v_lshlrev_b32 v71, 0x2, v71 // Bias address scaled by BPE ds_read_b32 v68, v71 offset:0 // load Bias ds_read_b32 v69, v71 offset:1024 // load scaleAlpha v_add_lshl_u32 v70, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v75, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v72, v75, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v80, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v77, v80, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v4, s74 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v85, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v82, v85, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v86, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v86, v8, s74 v_lshlrev_b32 v86, 0x2, v86 // Bias address scaled by BPE ds_read_b32 v83, v86 offset:0 // load Bias ds_read_b32 v84, v86 offset:1024 // load scaleAlpha v_add_lshl_u32 v85, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v90, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v87, v90, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v91, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v91, v8, s74 v_lshlrev_b32 v91, 0x2, v91 // Bias address scaled by BPE ds_read_b32 v88, v91 offset:0 // load Bias ds_read_b32 v89, v91 offset:1024 // load scaleAlpha v_add_lshl_u32 v90, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v95, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v92, v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE ds_read_b32 v93, v96 offset:0 // load Bias ds_read_b32 v94, v96 offset:1024 // load scaleAlpha v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v98, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v8, s74 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v101, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v100, v101, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v8, s74 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v107, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v106, v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v110, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v4, s74 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v113, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v112, v113, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v8, s74 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v119, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v118, v119, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v122, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v8, s74 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v125, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v124, v125, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v8, s74 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v131, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v130, v131, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v137, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v4, s74 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v140, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v139, v140, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v143, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v8, s74 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v146, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v145, v146, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v149, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v8, s74 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v152, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v151, v152, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v8, s74 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v158, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v157, v158, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v161, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v4, s74 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v164, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v163, v164, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v8, s74 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v170, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v169, v170, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v171, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v171, v8, s74 v_lshlrev_b32 v171, 0x2, v171 // Bias address scaled by BPE v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v173, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v174, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v174, v8, s74 v_lshlrev_b32 v174, 0x2, v174 // Bias address scaled by BPE v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v176, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v175, v176, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v177, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v177, v8, s74 v_lshlrev_b32 v177, 0x2, v177 // Bias address scaled by BPE v_add_lshl_u32 v176, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v180, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v180, v8, s74 v_lshlrev_b32 v180, 0x2, v180 // Bias address scaled by BPE v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v182, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v181, v182, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v183, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v183, v8, s74 v_lshlrev_b32 v183, 0x2, v183 // Bias address scaled by BPE v_add_lshl_u32 v182, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc177 // copy acc to vreg[108] v_accvgpr_read_b32 v[vgprValuC+22], acc181 // copy acc to vreg[109] v_accvgpr_read_b32 v[vgprValuC+23], acc185 // copy acc to vreg[110] @@ -20612,7 +20594,7 @@ v_mul_f32 v[vgprValuC+21], v59, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v57 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20620,7 +20602,7 @@ v_mul_f32 v[vgprValuC+22], v64, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v62 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v65, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20628,7 +20610,7 @@ v_mul_f32 v[vgprValuC+23], v69, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v67 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20636,7 +20618,7 @@ v_mul_f32 v[vgprValuC+24], v74, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v72 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20644,7 +20626,7 @@ v_mul_f32 v[vgprValuC+25], v79, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v77 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20652,7 +20634,7 @@ v_mul_f32 v[vgprValuC+26], v84, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v82 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20660,7 +20642,7 @@ v_mul_f32 v[vgprValuC+27], v89, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v87 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20668,7 +20650,7 @@ v_mul_f32 v[vgprValuC+28], v94, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v92 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20676,7 +20658,7 @@ v_mul_f32 v[vgprValuC+29], v59, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20684,7 +20666,7 @@ v_mul_f32 v[vgprValuC+30], v64, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v100 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20692,7 +20674,7 @@ v_mul_f32 v[vgprValuC+31], v69, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v103 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20700,7 +20682,7 @@ v_mul_f32 v[vgprValuC+32], v74, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v106 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20708,7 +20690,7 @@ v_mul_f32 v[vgprValuC+33], v79, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v109 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20716,7 +20698,7 @@ v_mul_f32 v[vgprValuC+34], v84, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v112 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20724,7 +20706,7 @@ v_mul_f32 v[vgprValuC+35], v89, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v115 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20732,7 +20714,7 @@ v_mul_f32 v[vgprValuC+36], v94, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v118 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20740,7 +20722,7 @@ v_mul_f32 v[vgprValuC+37], v59, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v121 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20748,7 +20730,7 @@ v_mul_f32 v[vgprValuC+38], v64, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v124 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20756,7 +20738,7 @@ v_mul_f32 v[vgprValuC+39], v69, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v127 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20764,7 +20746,7 @@ v_mul_f32 v[vgprValuC+40], v74, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20772,7 +20754,7 @@ v_mul_f32 v[vgprValuC+41], v79, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v136 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20780,7 +20762,7 @@ v_mul_f32 v[vgprValuC+42], v84, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v139 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20788,7 +20770,7 @@ v_mul_f32 v[vgprValuC+43], v89, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v142 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20796,7 +20778,7 @@ v_mul_f32 v[vgprValuC+44], v94, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v145 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20804,7 +20786,7 @@ v_mul_f32 v[vgprValuC+45], v59, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20812,7 +20794,7 @@ v_mul_f32 v[vgprValuC+46], v64, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v151 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20820,7 +20802,7 @@ v_mul_f32 v[vgprValuC+47], v69, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20828,7 +20810,7 @@ v_mul_f32 v[vgprValuC+48], v74, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v157 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20836,7 +20818,7 @@ v_mul_f32 v[vgprValuC+49], v79, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20844,7 +20826,7 @@ v_mul_f32 v[vgprValuC+50], v84, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v163 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20852,7 +20834,7 @@ v_mul_f32 v[vgprValuC+51], v89, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20860,7 +20842,7 @@ v_mul_f32 v[vgprValuC+52], v94, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v169 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20868,7 +20850,7 @@ v_mul_f32 v[vgprValuC+53], v59, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20876,7 +20858,7 @@ v_mul_f32 v[vgprValuC+54], v64, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v175 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20884,7 +20866,7 @@ v_mul_f32 v[vgprValuC+55], v69, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20892,7 +20874,7 @@ v_mul_f32 v[vgprValuC+56], v74, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v181 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20912,504 +20894,504 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v60, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v57, v60, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v61, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v61, v4, s74 v_lshlrev_b32 v61, 0x2, v61 // Bias address scaled by BPE ds_read_b32 v58, v61 offset:0 // load Bias ds_read_b32 v59, v61 offset:1024 // load scaleAlpha v_add_lshl_u32 v60, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v65, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v62, v65, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v66, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v66, v8, s74 v_lshlrev_b32 v66, 0x2, v66 // Bias address scaled by BPE ds_read_b32 v63, v66 offset:0 // load Bias ds_read_b32 v64, v66 offset:1024 // load scaleAlpha v_add_lshl_u32 v65, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v70, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v67, v70, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v71, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v71, v8, s74 v_lshlrev_b32 v71, 0x2, v71 // Bias address scaled by BPE ds_read_b32 v68, v71 offset:0 // load Bias ds_read_b32 v69, v71 offset:1024 // load scaleAlpha v_add_lshl_u32 v70, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v75, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v72, v75, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v80, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v77, v80, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v8, s74 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v85, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v82, v85, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v86, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v86, v8, s74 v_lshlrev_b32 v86, 0x2, v86 // Bias address scaled by BPE ds_read_b32 v83, v86 offset:0 // load Bias ds_read_b32 v84, v86 offset:1024 // load scaleAlpha v_add_lshl_u32 v85, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v90, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v87, v90, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v91, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v91, v8, s74 v_lshlrev_b32 v91, 0x2, v91 // Bias address scaled by BPE ds_read_b32 v88, v91 offset:0 // load Bias ds_read_b32 v89, v91 offset:1024 // load scaleAlpha v_add_lshl_u32 v90, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v95, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v92, v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE ds_read_b32 v93, v96 offset:0 // load Bias ds_read_b32 v94, v96 offset:1024 // load scaleAlpha v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v98, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s74 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v101, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v100, v101, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v8, s74 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v107, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v106, v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v110, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v8, s74 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v113, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v112, v113, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v8, s74 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v119, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v118, v119, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v122, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s74 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v125, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v124, v125, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v8, s74 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v131, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v130, v131, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v8, s74 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v140, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v139, v140, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v143, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v8, s74 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v146, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v145, v146, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v149, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s74 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v152, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v151, v152, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v8, s74 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v158, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v157, v158, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v161, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v8, s74 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v164, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v163, v164, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v8, s74 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v170, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v169, v170, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v171, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v171, v8, s74 v_lshlrev_b32 v171, 0x2, v171 // Bias address scaled by BPE v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v173, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v174, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v174, v4, s74 v_lshlrev_b32 v174, 0x2, v174 // Bias address scaled by BPE v_add_lshl_u32 v173, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v176, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v175, v176, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v177, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v177, v8, s74 v_lshlrev_b32 v177, 0x2, v177 // Bias address scaled by BPE v_add_lshl_u32 v176, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v180, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v180, v8, s74 v_lshlrev_b32 v180, 0x2, v180 // Bias address scaled by BPE v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v182, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v181, v182, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v183, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v183, v8, s74 v_lshlrev_b32 v183, 0x2, v183 // Bias address scaled by BPE v_add_lshl_u32 v182, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc66 // copy acc to vreg[144] v_accvgpr_read_b32 v[vgprValuC+22], acc70 // copy acc to vreg[145] v_accvgpr_read_b32 v[vgprValuC+23], acc74 // copy acc to vreg[146] @@ -21477,7 +21459,7 @@ v_mul_f32 v[vgprValuC+21], v59, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v57 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21485,7 +21467,7 @@ v_mul_f32 v[vgprValuC+22], v64, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v62 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v65, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21493,7 +21475,7 @@ v_mul_f32 v[vgprValuC+23], v69, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v67 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21501,7 +21483,7 @@ v_mul_f32 v[vgprValuC+24], v74, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v72 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21509,7 +21491,7 @@ v_mul_f32 v[vgprValuC+25], v79, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v77 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21517,7 +21499,7 @@ v_mul_f32 v[vgprValuC+26], v84, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v82 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21525,7 +21507,7 @@ v_mul_f32 v[vgprValuC+27], v89, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v87 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21533,7 +21515,7 @@ v_mul_f32 v[vgprValuC+28], v94, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v92 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21541,7 +21523,7 @@ v_mul_f32 v[vgprValuC+29], v59, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21549,7 +21531,7 @@ v_mul_f32 v[vgprValuC+30], v64, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v100 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21557,7 +21539,7 @@ v_mul_f32 v[vgprValuC+31], v69, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v103 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21565,7 +21547,7 @@ v_mul_f32 v[vgprValuC+32], v74, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v106 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21573,7 +21555,7 @@ v_mul_f32 v[vgprValuC+33], v79, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v109 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21581,7 +21563,7 @@ v_mul_f32 v[vgprValuC+34], v84, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v112 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21589,7 +21571,7 @@ v_mul_f32 v[vgprValuC+35], v89, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v115 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21597,7 +21579,7 @@ v_mul_f32 v[vgprValuC+36], v94, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v118 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21605,7 +21587,7 @@ v_mul_f32 v[vgprValuC+37], v59, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v121 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21613,7 +21595,7 @@ v_mul_f32 v[vgprValuC+38], v64, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v124 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21621,7 +21603,7 @@ v_mul_f32 v[vgprValuC+39], v69, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v127 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21629,7 +21611,7 @@ v_mul_f32 v[vgprValuC+40], v74, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21637,7 +21619,7 @@ v_mul_f32 v[vgprValuC+41], v79, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v136 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21645,7 +21627,7 @@ v_mul_f32 v[vgprValuC+42], v84, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v139 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21653,7 +21635,7 @@ v_mul_f32 v[vgprValuC+43], v89, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v142 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21661,7 +21643,7 @@ v_mul_f32 v[vgprValuC+44], v94, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v145 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21669,7 +21651,7 @@ v_mul_f32 v[vgprValuC+45], v59, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21677,7 +21659,7 @@ v_mul_f32 v[vgprValuC+46], v64, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v151 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21685,7 +21667,7 @@ v_mul_f32 v[vgprValuC+47], v69, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21693,7 +21675,7 @@ v_mul_f32 v[vgprValuC+48], v74, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v157 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21701,7 +21683,7 @@ v_mul_f32 v[vgprValuC+49], v79, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21709,7 +21691,7 @@ v_mul_f32 v[vgprValuC+50], v84, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v163 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21717,7 +21699,7 @@ v_mul_f32 v[vgprValuC+51], v89, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21725,7 +21707,7 @@ v_mul_f32 v[vgprValuC+52], v94, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v169 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21733,7 +21715,7 @@ v_mul_f32 v[vgprValuC+53], v59, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21741,7 +21723,7 @@ v_mul_f32 v[vgprValuC+54], v64, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v175 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21749,7 +21731,7 @@ v_mul_f32 v[vgprValuC+55], v69, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21757,7 +21739,7 @@ v_mul_f32 v[vgprValuC+56], v74, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v181 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21773,504 +21755,504 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,22,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v60, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v57, v60, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v61, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v61, v8, s74 v_lshlrev_b32 v61, 0x2, v61 // Bias address scaled by BPE ds_read_b32 v58, v61 offset:0 // load Bias ds_read_b32 v59, v61 offset:1024 // load scaleAlpha v_add_lshl_u32 v60, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v65, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v62, v65, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v66, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v66, v8, s74 v_lshlrev_b32 v66, 0x2, v66 // Bias address scaled by BPE ds_read_b32 v63, v66 offset:0 // load Bias ds_read_b32 v64, v66 offset:1024 // load scaleAlpha v_add_lshl_u32 v65, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v70, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v67, v70, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v71, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v71, v8, s74 v_lshlrev_b32 v71, 0x2, v71 // Bias address scaled by BPE ds_read_b32 v68, v71 offset:0 // load Bias ds_read_b32 v69, v71 offset:1024 // load scaleAlpha v_add_lshl_u32 v70, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v75, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v72, v75, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v80, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v77, v80, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v4, s74 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v85, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v82, v85, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v86, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v86, v8, s74 v_lshlrev_b32 v86, 0x2, v86 // Bias address scaled by BPE ds_read_b32 v83, v86 offset:0 // load Bias ds_read_b32 v84, v86 offset:1024 // load scaleAlpha v_add_lshl_u32 v85, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v90, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v87, v90, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v91, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v91, v8, s74 v_lshlrev_b32 v91, 0x2, v91 // Bias address scaled by BPE ds_read_b32 v88, v91 offset:0 // load Bias ds_read_b32 v89, v91 offset:1024 // load scaleAlpha v_add_lshl_u32 v90, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v95, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v92, v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE ds_read_b32 v93, v96 offset:0 // load Bias ds_read_b32 v94, v96 offset:1024 // load scaleAlpha v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v98, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v8, s74 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v101, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v100, v101, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v8, s74 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v107, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v106, v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v110, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v4, s74 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v113, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v112, v113, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v8, s74 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v119, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v118, v119, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v122, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v8, s74 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v125, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v124, v125, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v8, s74 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v131, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v130, v131, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v137, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v4, s74 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v140, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v139, v140, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v143, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v8, s74 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v146, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v145, v146, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v149, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v8, s74 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v152, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v151, v152, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v8, s74 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v158, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v157, v158, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v161, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v4, s74 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v164, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v163, v164, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v8, s74 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v170, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v169, v170, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v171, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v171, v8, s74 v_lshlrev_b32 v171, 0x2, v171 // Bias address scaled by BPE v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v173, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v174, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v174, v8, s74 v_lshlrev_b32 v174, 0x2, v174 // Bias address scaled by BPE v_add_lshl_u32 v173, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v176, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v175, v176, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v177, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v177, v8, s74 v_lshlrev_b32 v177, 0x2, v177 // Bias address scaled by BPE v_add_lshl_u32 v176, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v180, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v180, v8, s74 v_lshlrev_b32 v180, 0x2, v180 // Bias address scaled by BPE v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v182, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v181, v182, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v183, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v183, v8, s74 v_lshlrev_b32 v183, 0x2, v183 // Bias address scaled by BPE v_add_lshl_u32 v182, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc210 // copy acc to vreg[180] v_accvgpr_read_b32 v[vgprValuC+22], acc214 // copy acc to vreg[181] v_accvgpr_read_b32 v[vgprValuC+23], acc218 // copy acc to vreg[182] @@ -22338,7 +22320,7 @@ v_mul_f32 v[vgprValuC+21], v59, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v57 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22346,7 +22328,7 @@ v_mul_f32 v[vgprValuC+22], v64, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v62 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v65, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22354,7 +22336,7 @@ v_mul_f32 v[vgprValuC+23], v69, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v67 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22362,7 +22344,7 @@ v_mul_f32 v[vgprValuC+24], v74, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v72 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22370,7 +22352,7 @@ v_mul_f32 v[vgprValuC+25], v79, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v77 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22378,7 +22360,7 @@ v_mul_f32 v[vgprValuC+26], v84, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v82 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22386,7 +22368,7 @@ v_mul_f32 v[vgprValuC+27], v89, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v87 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22394,7 +22376,7 @@ v_mul_f32 v[vgprValuC+28], v94, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v92 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22402,7 +22384,7 @@ v_mul_f32 v[vgprValuC+29], v59, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22410,7 +22392,7 @@ v_mul_f32 v[vgprValuC+30], v64, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v100 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22418,7 +22400,7 @@ v_mul_f32 v[vgprValuC+31], v69, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v103 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22426,7 +22408,7 @@ v_mul_f32 v[vgprValuC+32], v74, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v106 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22434,7 +22416,7 @@ v_mul_f32 v[vgprValuC+33], v79, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v109 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22442,7 +22424,7 @@ v_mul_f32 v[vgprValuC+34], v84, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v112 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22450,7 +22432,7 @@ v_mul_f32 v[vgprValuC+35], v89, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v115 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22458,7 +22440,7 @@ v_mul_f32 v[vgprValuC+36], v94, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v118 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22466,7 +22448,7 @@ v_mul_f32 v[vgprValuC+37], v59, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v121 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22474,7 +22456,7 @@ v_mul_f32 v[vgprValuC+38], v64, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v124 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22482,7 +22464,7 @@ v_mul_f32 v[vgprValuC+39], v69, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v127 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22490,7 +22472,7 @@ v_mul_f32 v[vgprValuC+40], v74, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22498,7 +22480,7 @@ v_mul_f32 v[vgprValuC+41], v79, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v136 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22506,7 +22488,7 @@ v_mul_f32 v[vgprValuC+42], v84, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v139 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22514,7 +22496,7 @@ v_mul_f32 v[vgprValuC+43], v89, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v142 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22522,7 +22504,7 @@ v_mul_f32 v[vgprValuC+44], v94, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v145 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22530,7 +22512,7 @@ v_mul_f32 v[vgprValuC+45], v59, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22538,7 +22520,7 @@ v_mul_f32 v[vgprValuC+46], v64, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v151 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22546,7 +22528,7 @@ v_mul_f32 v[vgprValuC+47], v69, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22554,7 +22536,7 @@ v_mul_f32 v[vgprValuC+48], v74, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v157 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22562,7 +22544,7 @@ v_mul_f32 v[vgprValuC+49], v79, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22570,7 +22552,7 @@ v_mul_f32 v[vgprValuC+50], v84, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v163 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22578,7 +22560,7 @@ v_mul_f32 v[vgprValuC+51], v89, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22586,7 +22568,7 @@ v_mul_f32 v[vgprValuC+52], v94, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v169 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22594,7 +22576,7 @@ v_mul_f32 v[vgprValuC+53], v59, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22602,7 +22584,7 @@ v_mul_f32 v[vgprValuC+54], v64, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v175 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22610,7 +22592,7 @@ v_mul_f32 v[vgprValuC+55], v69, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22618,7 +22600,7 @@ v_mul_f32 v[vgprValuC+56], v74, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v181 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22638,504 +22620,504 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v60, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v57, v60, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v61, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v61, v4, s74 v_lshlrev_b32 v61, 0x2, v61 // Bias address scaled by BPE ds_read_b32 v58, v61 offset:0 // load Bias ds_read_b32 v59, v61 offset:1024 // load scaleAlpha v_add_lshl_u32 v60, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v60, v16, v60, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v60, v16, v60, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v65, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v62, v65, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v66, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v66, v8, s74 v_lshlrev_b32 v66, 0x2, v66 // Bias address scaled by BPE ds_read_b32 v63, v66 offset:0 // load Bias ds_read_b32 v64, v66 offset:1024 // load scaleAlpha v_add_lshl_u32 v65, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v65, v16, v65, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v65, v16, v65, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v70, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v67, v70, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v71, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v71, v8, s74 v_lshlrev_b32 v71, 0x2, v71 // Bias address scaled by BPE ds_read_b32 v68, v71 offset:0 // load Bias ds_read_b32 v69, v71 offset:1024 // load scaleAlpha v_add_lshl_u32 v70, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v16, v70, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v70, v16, v70, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v75, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v72, v75, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v80, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v77, v80, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v8, s74 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v16, v80, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v80, v16, v80, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v85, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v82, v85, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v86, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v86, v8, s74 v_lshlrev_b32 v86, 0x2, v86 // Bias address scaled by BPE ds_read_b32 v83, v86 offset:0 // load Bias ds_read_b32 v84, v86 offset:1024 // load scaleAlpha v_add_lshl_u32 v85, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v90, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v87, v90, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v91, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v91, v8, s74 v_lshlrev_b32 v91, 0x2, v91 // Bias address scaled by BPE ds_read_b32 v88, v91 offset:0 // load Bias ds_read_b32 v89, v91 offset:1024 // load scaleAlpha v_add_lshl_u32 v90, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v16, v90, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v90, v16, v90, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v95, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v92, v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE ds_read_b32 v93, v96 offset:0 // load Bias ds_read_b32 v94, v96 offset:1024 // load scaleAlpha v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v98, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s74 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v101, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v100, v101, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v8, s74 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v107, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v106, v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v110, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v8, s74 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v113, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v112, v113, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v8, s74 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v119, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v118, v119, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v122, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s74 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v125, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v124, v125, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v8, s74 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v131, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v130, v131, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v8, s74 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v140, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v139, v140, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v143, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v8, s74 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v146, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v145, v146, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v149, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v148, v149, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s74 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v16, v149, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v149, v16, v149, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v152, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v151, v152, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v155, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v154, v155, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v8, s74 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v16, v155, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v155, v16, v155, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v158, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v157, v158, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v161, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v160, v161, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v8, s74 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v16, v161, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v161, v16, v161, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v164, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v163, v164, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v167, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v166, v167, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v8, s74 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v16, v167, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v167, v16, v167, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v170, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v169, v170, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v171, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v171, v8, s74 v_lshlrev_b32 v171, 0x2, v171 // Bias address scaled by BPE v_add_lshl_u32 v170, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v170, v16, v170, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v170, v16, v170, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v173, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v172, v173, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v174, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v174, v4, s74 v_lshlrev_b32 v174, 0x2, v174 // Bias address scaled by BPE v_add_lshl_u32 v173, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v173, v16, v173, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v173, v16, v173, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v176, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v175, v176, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v177, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v177, v8, s74 v_lshlrev_b32 v177, 0x2, v177 // Bias address scaled by BPE v_add_lshl_u32 v176, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v176, v16, v176, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v176, v16, v176, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v179, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v178, v179, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v180, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v180, v8, s74 v_lshlrev_b32 v180, 0x2, v180 // Bias address scaled by BPE v_add_lshl_u32 v179, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v179, v16, v179, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v179, v16, v179, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v182, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v181, v182, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v183, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v183, v8, s74 v_lshlrev_b32 v183, 0x2, v183 // Bias address scaled by BPE v_add_lshl_u32 v182, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v182, v16, v182, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v182, v16, v182, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc99 // copy acc to vreg[216] v_accvgpr_read_b32 v[vgprValuC+22], acc103 // copy acc to vreg[217] v_accvgpr_read_b32 v[vgprValuC+23], acc107 // copy acc to vreg[218] @@ -23203,7 +23185,7 @@ v_mul_f32 v[vgprValuC+21], v59, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v57 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23211,7 +23193,7 @@ v_mul_f32 v[vgprValuC+22], v64, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v62 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v65, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23219,7 +23201,7 @@ v_mul_f32 v[vgprValuC+23], v69, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v67 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23227,7 +23209,7 @@ v_mul_f32 v[vgprValuC+24], v74, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v72 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23235,7 +23217,7 @@ v_mul_f32 v[vgprValuC+25], v79, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v77 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23243,7 +23225,7 @@ v_mul_f32 v[vgprValuC+26], v84, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v82 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23251,7 +23233,7 @@ v_mul_f32 v[vgprValuC+27], v89, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v87 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23259,7 +23241,7 @@ v_mul_f32 v[vgprValuC+28], v94, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v92 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23267,7 +23249,7 @@ v_mul_f32 v[vgprValuC+29], v59, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v97 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23275,7 +23257,7 @@ v_mul_f32 v[vgprValuC+30], v64, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v100 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23283,7 +23265,7 @@ v_mul_f32 v[vgprValuC+31], v69, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v103 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23291,7 +23273,7 @@ v_mul_f32 v[vgprValuC+32], v74, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v106 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23299,7 +23281,7 @@ v_mul_f32 v[vgprValuC+33], v79, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v109 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23307,7 +23289,7 @@ v_mul_f32 v[vgprValuC+34], v84, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v112 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23315,7 +23297,7 @@ v_mul_f32 v[vgprValuC+35], v89, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v115 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23323,7 +23305,7 @@ v_mul_f32 v[vgprValuC+36], v94, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v118 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23331,7 +23313,7 @@ v_mul_f32 v[vgprValuC+37], v59, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v121 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23339,7 +23321,7 @@ v_mul_f32 v[vgprValuC+38], v64, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v124 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23347,7 +23329,7 @@ v_mul_f32 v[vgprValuC+39], v69, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v127 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23355,7 +23337,7 @@ v_mul_f32 v[vgprValuC+40], v74, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23363,7 +23345,7 @@ v_mul_f32 v[vgprValuC+41], v79, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v136 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23371,7 +23353,7 @@ v_mul_f32 v[vgprValuC+42], v84, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v139 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23379,7 +23361,7 @@ v_mul_f32 v[vgprValuC+43], v89, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v142 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23387,7 +23369,7 @@ v_mul_f32 v[vgprValuC+44], v94, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v145 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23395,7 +23377,7 @@ v_mul_f32 v[vgprValuC+45], v59, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v148 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23403,7 +23385,7 @@ v_mul_f32 v[vgprValuC+46], v64, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v151 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23411,7 +23393,7 @@ v_mul_f32 v[vgprValuC+47], v69, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v154 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23419,7 +23401,7 @@ v_mul_f32 v[vgprValuC+48], v74, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v157 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23427,7 +23409,7 @@ v_mul_f32 v[vgprValuC+49], v79, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v160 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v78, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23435,7 +23417,7 @@ v_mul_f32 v[vgprValuC+50], v84, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v163 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v83, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23443,7 +23425,7 @@ v_mul_f32 v[vgprValuC+51], v89, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v166 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v88, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23451,7 +23433,7 @@ v_mul_f32 v[vgprValuC+52], v94, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v169 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v93, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23459,7 +23441,7 @@ v_mul_f32 v[vgprValuC+53], v59, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v172 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v58, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23467,7 +23449,7 @@ v_mul_f32 v[vgprValuC+54], v64, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v175 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v63, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23475,7 +23457,7 @@ v_mul_f32 v[vgprValuC+55], v69, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v68, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23483,7 +23465,7 @@ v_mul_f32 v[vgprValuC+56], v74, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v181 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v73, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23499,64 +23481,64 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,31,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v28, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v28, v16, v28, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v28, v16, v28, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v25, v28, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v29, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v29, v8, s74 v_lshlrev_b32 v29, 0x2, v29 // Bias address scaled by BPE ds_read_b32 v26, v29 offset:0 // load Bias ds_read_b32 v27, v29 offset:1024 // load scaleAlpha v_add_lshl_u32 v28, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v28, v16, v28, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v28, v16, v28, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v33, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v33, v16, v33, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v33, v16, v33, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v30, v33, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v34, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v34, v8, s74 v_lshlrev_b32 v34, 0x2, v34 // Bias address scaled by BPE ds_read_b32 v31, v34 offset:0 // load Bias ds_read_b32 v32, v34 offset:1024 // load scaleAlpha v_add_lshl_u32 v33, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v33, v16, v33, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v33, v16, v33, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v38, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v38, v16, v38, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v38, v16, v38, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v35, v38, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v39, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v39, v8, s74 v_lshlrev_b32 v39, 0x2, v39 // Bias address scaled by BPE ds_read_b32 v36, v39 offset:0 // load Bias ds_read_b32 v37, v39 offset:1024 // load scaleAlpha v_add_lshl_u32 v38, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v38, v16, v38, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v38, v16, v38, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v43, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v43, v16, v43, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v43, v16, v43, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v40, v43, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v44, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v44, v8, s74 v_lshlrev_b32 v44, 0x2, v44 // Bias address scaled by BPE ds_read_b32 v41, v44 offset:0 // load Bias ds_read_b32 v42, v44 offset:1024 // load scaleAlpha v_add_lshl_u32 v43, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v43, v16, v43, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v43, v16, v43, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+21], acc243 // copy acc to vreg[252] v_accvgpr_read_b32 v[vgprValuC+22], acc247 // copy acc to vreg[253] v_accvgpr_read_b32 v[vgprValuC+23], acc251 // copy acc to vreg[254] @@ -23576,7 +23558,7 @@ v_mul_f32 v[vgprValuC+21], v27, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v25 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v26, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23584,7 +23566,7 @@ v_mul_f32 v[vgprValuC+22], v32, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v30 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v31, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v33, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23592,7 +23574,7 @@ v_mul_f32 v[vgprValuC+23], v37, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v35 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v36, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23600,14 +23582,14 @@ v_mul_f32 v[vgprValuC+24], v42, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v8, v40 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v8, v41, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v43, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst s_branch label_GW_End // jump to end label_Activation_None_VW8: -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] label_Activation_Gelu_VW8: v_mul_f32 v16, 0x3d372713, v8 // k1 * x v_fma_f32 v16, v8, v16, 1.0 // 1 + (k1 * x * x) @@ -23705,7 +23687,7 @@ s_nop 0 // 1 wait states v_fma_f32 v16, -2.0, v16, 2.0 // ( + 1 (fused)) v_mul_f32 v16, v15, v16 // x * (1 + tanh(...)) v_mul_f32 v15, 0.5, v16 // 0.5 * x * (1 + tanh(...)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] label_Activation_Relu_VW8: v_max_f32 v8, v8, 0 // x = max(0, x) v_max_f32 v9, v9, 0 // x = max(0, x) @@ -23715,7 +23697,7 @@ v_max_f32 v12, v12, 0 // x = max(0, x) v_max_f32 v13, v13, 0 // x = max(0, x) v_max_f32 v14, v14, 0 // x = max(0, x) v_max_f32 v15, v15, 0 // x = max(0, x) -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] label_Activation_Silu_VW8: v_mul_f32 v16, -1.4426950408889634, v8 // (fused -1.442695) v_exp_f32 v16, v16 // exp step 2 @@ -23773,9 +23755,27 @@ v_add_f32 v16, 1.0, v16 // 1 + exp(-x) v_rcp_f32 v16, v16 // 1 / (1 + exp(-x)) s_nop 0 // 1 wait states v_mul_f32 v15, v15, v16 // x / (1 + exp(-x)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] +label_Activation_Clamp_VW8: +v_min_f32 v8, s[sgpractivationBeta], v8 // min(x, beta) +v_max_f32 v8, s[sgpractivationAlpha], v8 // max(alpha, min(x, beta)) +v_min_f32 v9, s[sgpractivationBeta], v9 // min(x, beta) +v_max_f32 v9, s[sgpractivationAlpha], v9 // max(alpha, min(x, beta)) +v_min_f32 v10, s[sgpractivationBeta], v10 // min(x, beta) +v_max_f32 v10, s[sgpractivationAlpha], v10 // max(alpha, min(x, beta)) +v_min_f32 v11, s[sgpractivationBeta], v11 // min(x, beta) +v_max_f32 v11, s[sgpractivationAlpha], v11 // max(alpha, min(x, beta)) +v_min_f32 v12, s[sgpractivationBeta], v12 // min(x, beta) +v_max_f32 v12, s[sgpractivationAlpha], v12 // max(alpha, min(x, beta)) +v_min_f32 v13, s[sgpractivationBeta], v13 // min(x, beta) +v_max_f32 v13, s[sgpractivationAlpha], v13 // max(alpha, min(x, beta)) +v_min_f32 v14, s[sgpractivationBeta], v14 // min(x, beta) +v_max_f32 v14, s[sgpractivationAlpha], v14 // max(alpha, min(x, beta)) +v_min_f32 v15, s[sgpractivationBeta], v15 // min(x, beta) +v_max_f32 v15, s[sgpractivationAlpha], v15 // max(alpha, min(x, beta)) +s_setpc_b64 s[64:65] label_Activation_None_VW1: -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] label_Activation_Gelu_VW1: v_mul_f32 v16, 0x3d372713, v8 // k1 * x v_fma_f32 v16, v8, v16, 1.0 // 1 + (k1 * x * x) @@ -23789,10 +23789,10 @@ s_nop 0 // 1 wait states v_fma_f32 v16, -2.0, v16, 2.0 // ( + 1 (fused)) v_mul_f32 v16, v8, v16 // x * (1 + tanh(...)) v_mul_f32 v8, 0.5, v16 // 0.5 * x * (1 + tanh(...)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] label_Activation_Relu_VW1: v_max_f32 v8, v8, 0 // x = max(0, x) -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] label_Activation_Silu_VW1: v_mul_f32 v16, -1.4426950408889634, v8 // (fused -1.442695) v_exp_f32 v16, v16 // exp step 2 @@ -23801,7 +23801,11 @@ v_add_f32 v16, 1.0, v16 // 1 + exp(-x) v_rcp_f32 v16, v16 // 1 / (1 + exp(-x)) s_nop 0 // 1 wait states v_mul_f32 v8, v8, v16 // x / (1 + exp(-x)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] +label_Activation_Clamp_VW1: +v_min_f32 v8, s[sgpractivationBeta], v8 // min(x, beta) +v_max_f32 v8, s[sgpractivationAlpha], v8 // max(alpha, min(x, beta)) +s_setpc_b64 s[64:65] label_SK_Partials: label_GW_Partials_E0: s_mov_b64 s[sgprSrdWS+0:sgprSrdWS+0+1], s[sgprAddressWS+0:sgprAddressWS+0+1] // init SRD base address @@ -24199,23 +24203,23 @@ s_nop 0 // 1 wait state required when s_waitcnt vmcnt(0) // wait for data store s_barrier // store all data before setting flag s_lshl_b32 s8, s[sgprStreamKIdx], 2 // flag offset based on CTA index -v_readfirstlane_b32 s67, v[vgprSerial] // Wave 0 updates flags -s_cmp_eq_u32 s67, 0 // Check for wave 0 +v_readfirstlane_b32 s64, v[vgprSerial] // Wave 0 updates flags +s_cmp_eq_u32 s64, 0 // Check for wave 0 s_cbranch_scc0 label_SK_SkipFlagSet // Skip flag set -s_mov_b32 s67, 1 // flag data -s_store_dword s67, s[sgprAddressFlags:sgprAddressFlags+1], s8 glc // set flag +s_mov_b32 s64, 1 // flag data +s_store_dword s64, s[sgprAddressFlags:sgprAddressFlags+1], s8 glc // set flag label_SK_SkipFlagSet: s_waitcnt lgkmcnt(0) // wait for flag s_branch label_GW_End // jump to end label_GW_End: s_cmp_ge_u32 s[sgprStreamKIter], s[sgprStreamKIterEnd] // Check if done all StreamK iterations s_cbranch_scc1 label_NoBranch_Y57Y54XUE2DV604X // Only branch on scc0 -s_getpc_b64 s[82:83] // addr of next instr -s_add_i32 s84, label_PersistentLoopStart, 4 // target branch offset -s_abs_i32 s84, s84 // abs offset -s_sub_u32 s82, s82, s84 // sub target branch offset -s_subb_u32 s83, s83, 0 // sub high and carry -s_setpc_b64 s[82:83] // branch to label_PersistentLoopStart +s_getpc_b64 s[74:75] // addr of next instr +s_add_i32 s76, label_PersistentLoopStart, 4 // target branch offset +s_abs_i32 s76, s76 // abs offset +s_sub_u32 s74, s74, s76 // sub target branch offset +s_subb_u32 s75, s75, 0 // sub high and carry +s_setpc_b64 s[74:75] // branch to label_PersistentLoopStart label_NoBranch_Y57Y54XUE2DV604X: label_KernelEnd: s_endpgm // Kernel End diff --git a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_F8B8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950.s b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_F8B8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950.s index 011315e72fd4..7b98cbb96e02 100644 --- a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_F8B8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950.s +++ b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_F8B8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950.s @@ -30,7 +30,7 @@ .text /* Num VGPR =249 */ /* Num AccVGPR=256 */ -/* Num SGPR =111 */ +/* Num SGPR =105 */ /******************************************/ /* Optimizations and Config: */ @@ -230,118 +230,83 @@ amdhsa.kernels: .offset: 116 .value_kind: by_value .value_type: f32 - - .name: MagicNumberProblemNumGroupTiles0 + - .name: ItersPerTile .size: 4 .offset: 120 .value_kind: by_value .value_type: u32 - - .name: MagicShiftProblemNumGroupTiles0 + - .name: TotalIters .size: 4 .offset: 124 .value_kind: by_value .value_type: u32 - - .name: ItersPerTile + - .name: SKItersPerWG .size: 4 .offset: 128 .value_kind: by_value .value_type: u32 - - .name: MagicNumberItersPerTile + - .name: skGridAndTiles .size: 4 .offset: 132 .value_kind: by_value .value_type: u32 - - .name: MagicShiftItersPerTile - .size: 4 - .offset: 136 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumProblemNumGroupTiles0By1 - .size: 4 - .offset: 140 - .value_kind: by_value - .value_type: u32 - - .name: MagicShiftProblemNumGroupTiles0By1 - .size: 4 - .offset: 144 - .value_kind: by_value - .value_type: u32 - - .name: TotalIters - .size: 4 - .offset: 148 - .value_kind: by_value - .value_type: u32 - - .name: SKItersPerWG - .size: 4 - .offset: 152 - .value_kind: by_value - .value_type: u32 - - .name: skGrid - .size: 4 - .offset: 156 - .value_kind: by_value - .value_type: u32 - - .name: skTiles - .size: 4 - .offset: 160 - .value_kind: by_value - .value_type: u32 - .name: skExtraIters .size: 4 - .offset: 164 + .offset: 136 .value_kind: by_value .value_type: u32 - .name: AddressScaleA .size: 8 - .offset: 168 + .offset: 140 .value_kind: global_buffer .value_type: f32 .address_space: generic - .name: AddressScaleB .size: 8 - .offset: 176 + .offset: 148 .value_kind: global_buffer .value_type: f32 .address_space: generic - .name: AddressScaleAlphaVec .size: 8 - .offset: 184 + .offset: 156 .value_kind: global_buffer .value_type: f32 .address_space: generic - .name: bias .size: 8 - .offset: 192 + .offset: 164 .value_kind: global_buffer .value_type: void .address_space: generic - .name: biasType .size: 4 - .offset: 200 + .offset: 172 .value_kind: by_value .value_type: u32 - .name: StrideBias .size: 4 - .offset: 204 + .offset: 176 .value_kind: by_value .value_type: u32 - .name: activationAlpha .size: 4 - .offset: 208 + .offset: 180 .value_kind: by_value .value_type: f32 - .name: activationBeta .size: 4 - .offset: 212 + .offset: 184 .value_kind: by_value .value_type: f32 - .name: activationType .size: 4 - .offset: 216 + .offset: 188 .value_kind: by_value .value_type: u32 .group_segment_fixed_size: 135168 .kernarg_segment_align: 8 - .kernarg_segment_size: 224 + .kernarg_segment_size: 192 .max_flat_workgroup_size: 256 .private_segment_fixed_size: 0 .sgpr_count: 102 @@ -352,7 +317,6 @@ amdhsa.kernels: ... .end_amdgpu_metadata Custom_Cijk_Alik_Bljk_F8B8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950: -label_ASM_Start: /// Main body of the asm kernel .macro V_MAGIC_DIV vgprDstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req v_mul_hi_u32 v[\vgprDstIdx+1], \dividend, \magicNumber v_mul_lo_u32 v[\vgprDstIdx+0], \dividend, \magicA @@ -413,28 +377,21 @@ label_ASM_Start: /// Main body of the asm kernel .set sgprStridesB, 42 .set sgprAlpha, 44 .set sgprBeta, 45 -.set sgprMagicNumberProblemNumGroupTiles0, 46 -.set sgprMagicShiftProblemNumGroupTiles0, 47 -.set sgprItersPerTile, 48 -.set sgprMagicNumberItersPerTile, 49 -.set sgprMagicShiftItersPerTile, 50 -.set sgprMagicNumProblemNumGroupTiles0By1, 51 -.set sgprMagicShiftProblemNumGroupTiles0By1, 52 -.set sgprTotalIters, 53 -.set sgprSKItersPerWG, 54 -.set sgprskGrid, 55 -.set sgprskTiles, 56 -.set sgprskExtraIters, 57 -.set sgprLocalWriteAddrA, 58 -.set sgprLocalWriteAddrB, 59 -.set sgprSwapA, 60 -.set sgprSwapB, 61 -.set sgprStreamKIdx, 62 -.set sgprStreamKIter, 63 -.set sgprStreamKIterEnd, 64 -.set sgprStreamKLocalStart, 65 -.set sgprStreamKLocalEnd, 66 -.set sgprSrdWS, 68 +.set sgprItersPerTile, 46 +.set sgprTotalIters, 47 +.set sgprSKItersPerWG, 48 +.set sgprskGridAndTiles, 49 +.set sgprskExtraIters, 50 +.set sgprLocalWriteAddrA, 51 +.set sgprLocalWriteAddrB, 52 +.set sgprSwapA, 53 +.set sgprSwapB, 54 +.set sgprStreamKIdx, 55 +.set sgprStreamKIter, 56 +.set sgprStreamKIterEnd, 57 +.set sgprStreamKLocalStart, 58 +.set sgprStreamKLocalEnd, 59 +.set sgprSrdWS, 60 /* Size Assignments */ .set sgprSizeI, sgprSizesFree+0 @@ -515,29 +472,30 @@ label_ASM_Start: /// Main body of the asm kernel /******************************************/ /* Load num of Gemms */ -s_load_dword s67, s[sgprKernArgAddress:sgprKernArgAddress+1], 0 +s_load_dword s64, s[sgprKernArgAddress:sgprKernArgAddress+1], 0 /* Load packed kernel args (StaggerU/GSU) */ -s_load_dword s73, s[sgprKernArgAddress:sgprKernArgAddress+1], 4 +s_load_dword s66, s[sgprKernArgAddress:sgprKernArgAddress+1], 4 /* Load WGM data */ s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 8 /* Load num of WGs */ -s_load_dword s74, s[sgprKernArgAddress:sgprKernArgAddress+1], 12 +s_load_dword s67, s[sgprKernArgAddress:sgprKernArgAddress+1], 12 s_waitcnt lgkmcnt(0) // load args -s_lshr_b32 s72, s67, 0x1e // Get arg type -s_and_b32 s67, 0x3fffffff, s67 // Get nums of gemm -s_cmp_eq_u32 s72, 0 // Is kernel args +s_lshr_b32 s65, s64, 0x1e // Get arg type +s_and_b32 s64, 0x3fffffff, s64 // Get nums of gemm +s_cmp_eq_u32 s65, 0 // Is kernel args s_cbranch_scc0 label_HBMArgs s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Load Kernel Args */ s_load_dwordx16 s[20:35], s[sgprKernArgAddress:sgprKernArgAddress+1], 0 // 0 -s_load_dwordx16 s[36:51], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 -s_load_dwordx4 s[52:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 128 // 128 -s_load_dwordx2 s[56:57], s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx8 s[36:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 +s_load_dwordx4 s[44:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 +s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120 s_waitcnt lgkmcnt(0) // preload s_branch label_LoadArgsEnd label_HBMArgs: @@ -548,9 +506,7 @@ s_waitcnt lgkmcnt(0) // wait for args to load label_LoadArgsEnd: s_branch label_common_kernel_entry -/* pad 35 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */ -s_nop 0 -s_nop 0 +/* pad 33 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */ s_nop 0 s_nop 0 s_nop 0 @@ -585,10 +541,10 @@ s_nop 0 s_nop 0 s_nop 0 label_Preload_Offset_Start: -s_and_b32 s67, 0x3fffffff, s2 // Get nums of gemm -s_lshr_b32 s72, s2, 0x1e // Get arg type -s_mov_b32 s73, s3 // Preload internal args -s_cmp_eq_u32 s72, 0 // Is kernel args +s_and_b32 s64, 0x3fffffff, s2 // Get nums of gemm +s_lshr_b32 s65, s2, 0x1e // Get arg type +s_mov_b32 s66, s3 // Preload internal args +s_cmp_eq_u32 s65, 0 // Is kernel args s_cbranch_scc0 label_Preload_HBMArgs s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 @@ -596,9 +552,9 @@ s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Load Kernel Args */ s_load_dword s27, s[sgprKernArgAddress:sgprKernArgAddress+1], 28 // 28 s_load_dwordx16 s[28:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 32 // 32 -s_load_dwordx8 s[44:51], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96 -s_load_dwordx4 s[52:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 128 // 128 -s_load_dwordx2 s[56:57], s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx4 s[44:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 +s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120 s_mov_b64 s[20:21], s[6:7] // move preload data to correct sgpr s_mov_b64 s[22:23], s[8:9] // move preload data to correct sgpr s_mov_b64 s[24:25], s[10:11] // move preload data to correct sgpr @@ -608,90 +564,90 @@ label_Preload_HBMArgs: s_mov_b64 s[sgprKernArgAddress:sgprKernArgAddress+1], s[6:7] // Load address of kernel arguments label_Preload_LoadArgsEnd: s_mov_b32 s[sgprWGM], s4 // Preload internal args2 -s_mov_b32 s74, s5 // Load num of WGs +s_mov_b32 s67, s5 // Load num of WGs label_common_kernel_entry: /// for both preload/non-preload common code s_mov_b32 s[sgprWorkGroup0+0], s13 // restore workgroup id s_mov_b32 s[sgprWorkGroup0+1], s14 // restore workgroup id s_mov_b32 s[sgprWorkGroup0+2], s15 // restore workgroup id -s_and_b32 s[sgprStaggerU], s73, 0xffff0000 // Restore StaggerU related vars +s_and_b32 s[sgprStaggerU], s66, 0xffff0000 // Restore StaggerU related vars s_lshr_b32 s[sgprStaggerU], s[sgprStaggerU], 0x10 -s_mov_b32 s[sgprArgType], s72 +s_mov_b32 s[sgprArgType], s65 s_mov_b32 m0, 0x21000 // LDS clamp at 135168 bytes v_mov_b32 v[vgprSerial], v0 // thread serial id /* remap workgroup to XCCs */ -s_lshr_b32 s80, s[sgprWGM], 0x10 // Get WGMXCC -s_ff1_i32_b32 s80, s80 // Get log(WGMXCC) -s_lshr_b32 s81, s[sgprWGM], 0x16 // Get CU_Count +s_lshr_b32 s72, s[sgprWGM], 0x10 // Get WGMXCC +s_ff1_i32_b32 s72, s72 // Get log(WGMXCC) +s_lshr_b32 s73, s[sgprWGM], 0x16 // Get CU_Count /* remap WGs if WGMXCC > 1 ( log(WGMXCC) > 0 ) */ -s_cmp_gt_i32 s80, 0 +s_cmp_gt_i32 s72, 0 s_cbranch_scc0 label_skip_WGMXCC /* only remap WGs in the range */ -s_lshr_b32 s77, s74, s80 -s_lshl_b32 s77, s77, s80 -s_cmp_ge_u32 s[sgprWorkGroup0], s77 +s_lshr_b32 s69, s67, s72 +s_lshl_b32 s69, s69, s72 +s_cmp_ge_u32 s[sgprWorkGroup0], s69 s_cbranch_scc1 label_skip_WGMXCC -s_cmp_eq_u32 s81, 0 // CU_Count == 0 ? +s_cmp_eq_u32 s73, 0 // CU_Count == 0 ? s_cbranch_scc0 label_XCCG_nonzero -s_lshr_b32 s77, s[sgprWorkGroup0], s80 -s_bfm_b32 s78, s80, 0 -s_and_b32 s78, s[sgprWorkGroup0], s78 -s_lshr_b32 s79, s74, s80 -s_mul_i32 s78, s78, s79 -s_add_u32 s[sgprWorkGroup0], s77, s78 +s_lshr_b32 s69, s[sgprWorkGroup0], s72 +s_bfm_b32 s70, s72, 0 +s_and_b32 s70, s[sgprWorkGroup0], s70 +s_lshr_b32 s71, s67, s72 +s_mul_i32 s70, s70, s71 +s_add_u32 s[sgprWorkGroup0], s69, s70 s_branch label_skip_WGMXCC label_XCCG_nonzero: /* temp0 = (wg//CU_Count)*CU_Count */ -v_cvt_f32_u32 v4, s81 // wg//CU_Count +v_cvt_f32_u32 v4, s73 // wg//CU_Count v_rcp_iflag_f32 v4, v4 // wg//CU_Count v_cvt_f32_u32 v5, s[sgprWorkGroup0] // wg//CU_Count v_mul_f32 v4, v4, v5 // wg//CU_Count v_cvt_u32_f32 v4, v4 // wg//CU_Count -v_mul_u32_u24 v5, v4, s81 // wg//CU_Count +v_mul_u32_u24 v5, v4, s73 // wg//CU_Count v_sub_u32 v5, s[sgprWorkGroup0], v5 // wg//CU_Count -v_cmpx_eq_u32 exec, v5, s81 // wg//CU_Count +v_cmpx_eq_u32 exec, v5, s73 // wg//CU_Count v_add_u32 v4, 1, v4 // wg//CU_Count v_mov_b32 v5, 0 // wg//CU_Count s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s81 // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s73 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 -v_mul_u32_u24 v5, v4, s81 // re-calculate remainder +v_mul_u32_u24 v5, v4, s73 // re-calculate remainder v_sub_u32 v5, s[sgprWorkGroup0], v5 // re-calculate remainder s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s77, v4 // quotient -v_readfirstlane_b32 s78, v5 // remainder -s_mul_i32 s77, s77, s81 +v_readfirstlane_b32 s69, v4 // quotient +v_readfirstlane_b32 s70, v5 // remainder +s_mul_i32 s69, s69, s73 /* temp1 = (wg%CU_Count)//WGMXCC */ -s_lshr_b32 s78, s78, s80 +s_lshr_b32 s70, s70, s72 /* temp0 = temp0 + temp1 */ -s_add_u32 s77, s77, s78 +s_add_u32 s69, s69, s70 /* temp1 = (wg%WGMXCC) * ((WGs - (WGs//CU_Count) * CU_Count) if (wg > (WGs//CU_Count) * CU_Count) else CU_Count)//WGMXCC */ -v_cvt_f32_u32 v4, s81 // WGs//CU_Count +v_cvt_f32_u32 v4, s73 // WGs//CU_Count v_rcp_iflag_f32 v4, v4 // WGs//CU_Count -v_cvt_f32_u32 v5, s74 // WGs//CU_Count +v_cvt_f32_u32 v5, s67 // WGs//CU_Count v_mul_f32 v4, v4, v5 // WGs//CU_Count v_cvt_u32_f32 v4, v4 // WGs//CU_Count -v_mul_u32_u24 v5, v4, s81 // WGs//CU_Count -v_sub_u32 v5, s74, v5 // WGs//CU_Count -v_cmpx_eq_u32 exec, v5, s81 // WGs//CU_Count +v_mul_u32_u24 v5, v4, s73 // WGs//CU_Count +v_sub_u32 v5, s67, v5 // WGs//CU_Count +v_cmpx_eq_u32 exec, v5, s73 // WGs//CU_Count v_add_u32 v4, 1, v4 // WGs//CU_Count s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s81 // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s73 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s78, v4 // quotient -s_mul_i32 s78, s78, s81 -s_sub_u32 s79, s74, s78 -s_cmp_gt_u32 s[sgprWorkGroup0], s78 -s_cselect_b32 s78, s79, s81 -s_lshr_b32 s78, s78, s80 -s_bfm_b32 s79, s80, 0 -s_and_b32 s79, s[sgprWorkGroup0], s79 -s_mul_i32 s78, s78, s79 +v_readfirstlane_b32 s70, v4 // quotient +s_mul_i32 s70, s70, s73 +s_sub_u32 s71, s67, s70 +s_cmp_gt_u32 s[sgprWorkGroup0], s70 +s_cselect_b32 s70, s71, s73 +s_lshr_b32 s70, s70, s72 +s_bfm_b32 s71, s72, 0 +s_and_b32 s71, s[sgprWorkGroup0], s71 +s_mul_i32 s70, s70, s71 /* WorkGroup0 = temp0 + temp1 */ -s_add_u32 s[sgprWorkGroup0], s77, s78 +s_add_u32 s[sgprWorkGroup0], s69, s70 label_skip_WGMXCC: /// skip WGMXCC if no enough WGs to remap -s_cmp_eq_u32 s72, 0 +s_cmp_eq_u32 s65, 0 s_cbranch_scc0 label_MultiGemm /* init: add vgpr [4...136) to pool */ /* init: add vgpr [0...0) to pool */ @@ -721,97 +677,98 @@ v_cmp_ne_u32 vcc, v7, 0 // v4 = ceil(v5 / v6) v_addc_co_u32 v4, vcc, v4, 0, vcc // ceil s_nop 0 // 1 wait states v_readfirstlane_b32 s[sgprNumWorkGroups1], v4 // set back to numWorkGroup1 -s_waitcnt lgkmcnt(0) // wait for 108/0 bytes of kern args +s_waitcnt lgkmcnt(0) // wait for 80/0 bytes of kern args s_branch label_MultiGemmEnd label_MultiGemm: /* Check if custom structure pointer is null */ s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ? s_cbranch_scc1 label_IsExternalValid // branch if ArgType == 2 -s_mov_b32 s11, 204 -s_mul_i32 s78, s67, 4 -s_mov_b64 s[72:73], s[sgprKernArgAddress:sgprKernArgAddress+1] +s_mov_b32 s11, 176 +s_mul_i32 s72, s64, 4 +s_mov_b64 s[66:67], s[sgprKernArgAddress:sgprKernArgAddress+1] s_branch label_IsExternalValidEnd label_IsExternalValid: -s_mov_b32 s11, 244 -s_mov_b32 s78, 0 -s_mov_b64 s[72:73], s[sgprKernArgAddress:sgprKernArgAddress+1] +s_mov_b32 s11, 216 +s_mov_b32 s72, 0 +s_mov_b64 s[66:67], s[sgprKernArgAddress:sgprKernArgAddress+1] label_IsExternalValidEnd: /* Grouped Gemm:: prefetch 1 arg load */ s_mov_b32 s10, 1 -s_mov_b32 s79, 0 -s_load_dwordx4 s[20:23], s[72:73], s78 -s_cmpk_eq_u32 s67, 1 // if gemm_count is 1? +s_mov_b32 s73, 0 +s_load_dwordx4 s[20:23], s[66:67], s72 +s_cmpk_eq_u32 s64, 1 // if gemm_count is 1? s_cbranch_scc1 label_wgTable_noLoadLoop /* Grouped Gemm:: accumulate numTiles for each gemm */ /* Grouped Gemm:: loop start */ label_Loop_GemmCount: s_waitcnt lgkmcnt(0) -s_lshr_b32 s76, s20, 8 // s76 = s20 / 256 -s_and_b32 s74, 255, s20 // s74 = s20 % 256 -s_addc_u32 s76, s76, 0 -s_lshr_b32 s77, s21, 8 // s77 = s21 / 256 -s_and_b32 s74, 255, s21 // s74 = s21 % 256 -s_addc_u32 s77, s77, 0 -s_mul_i32 s76, s76, s77 -s_mul_i32 s76, s76, s22 -s_add_u32 s79, s79, s76 -s_cmp_lt_u32 s[sgprWorkGroup0], s79 +s_lshr_b32 s70, s20, 8 // s70 = s20 / 256 +s_and_b32 s68, 255, s20 // s68 = s20 % 256 +s_addc_u32 s70, s70, 0 +s_lshr_b32 s71, s21, 8 // s71 = s21 / 256 +s_and_b32 s68, 255, s21 // s68 = s21 % 256 +s_addc_u32 s71, s71, 0 +s_mul_i32 s70, s70, s71 +s_mul_i32 s70, s70, s22 +s_add_u32 s73, s73, s70 +s_cmp_lt_u32 s[sgprWorkGroup0], s73 s_cbranch_scc1 label_FOUND -s_add_u32 s78, s78, s11 -s_load_dwordx4 s[20:23], s[72:73], s78 +s_add_u32 s72, s72, s11 +s_load_dwordx4 s[20:23], s[66:67], s72 s_add_u32 s10, s10, 1 -s_cmp_lt_u32 s10, s67 +s_cmp_lt_u32 s10, s64 s_cbranch_scc1 label_Loop_GemmCount /* Grouped Gemm:: noLoadLoop */ label_wgTable_noLoadLoop: s_waitcnt lgkmcnt(0) -s_lshr_b32 s76, s20, 8 // s76 = s20 / 256 -s_and_b32 s74, 255, s20 // s74 = s20 % 256 -s_addc_u32 s76, s76, 0 -s_lshr_b32 s77, s21, 8 // s77 = s21 / 256 -s_and_b32 s74, 255, s21 // s74 = s21 % 256 -s_addc_u32 s77, s77, 0 -s_mul_i32 s76, s76, s77 -s_mul_i32 s76, s76, s22 -s_add_u32 s79, s79, s76 +s_lshr_b32 s70, s20, 8 // s70 = s20 / 256 +s_and_b32 s68, 255, s20 // s68 = s20 % 256 +s_addc_u32 s70, s70, 0 +s_lshr_b32 s71, s21, 8 // s71 = s21 / 256 +s_and_b32 s68, 255, s21 // s68 = s21 % 256 +s_addc_u32 s71, s71, 0 +s_mul_i32 s70, s70, s71 +s_mul_i32 s70, s70, s22 +s_add_u32 s73, s73, s70 /* Grouped Gemm:: gemmIndex found */ label_FOUND: -s_sub_u32 s73, s10, 1 -s_sub_u32 s72, s79, s76 -s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s72 +s_sub_u32 s67, s10, 1 +s_sub_u32 s66, s73, s70 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s66 /* Check if custom structure pointer is null */ s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ? s_cbranch_scc1 label_LoadExternalStruct // branch if ArgType == 2 /* Grouped Gemm: offset argument address to gemm */ /* Grouped Gemm: offset address from wg_table_start to args_start */ -s_lshl2_add_u32 s[sgprKernArgAddress], s67, s[sgprKernArgAddress] +s_lshl2_add_u32 s[sgprKernArgAddress], s64, s[sgprKernArgAddress] s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Grouped Gemm: offset address from args_start to gemm_start */ -s_mul_i32 s73, s73, 204 -s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s73 +s_mul_i32 s67, s67, 176 +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s67 s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Load Kernel Args */ s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16 -s_load_dwordx16 s[40:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 -s_load_dwordx2 s[56:57], s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx8 s[40:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 +s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120 s_branch label_LoadExternalStructEnd label_LoadExternalStruct: /* Grouped Gemm: offset address from args_start to gemm_start */ -s_mul_i32 s73, s73, 244 -s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s73 +s_mul_i32 s67, s67, 216 +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s67 s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16 -s_load_dwordx16 s[40:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 -s_load_dword s56, s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx8 s[40:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 // Read Beta -s_load_dword s45, s[sgprKernArgAddress:sgprKernArgAddress+1], 160 // 160 +s_load_dword s45, s[sgprKernArgAddress:sgprKernArgAddress+1], 132 // 132 label_LoadExternalStructEnd: /* init: add vgpr [4...136) to pool */ /* init: add vgpr [0...0) to pool */ @@ -841,7 +798,7 @@ v_cmp_ne_u32 vcc, v7, 0 // v4 = ceil(v5 / v6) v_addc_co_u32 v4, vcc, v4, 0, vcc // ceil s_nop 0 // 1 wait states v_readfirstlane_b32 s[sgprNumWorkGroups1], v4 // set back to numWorkGroup1 -s_waitcnt lgkmcnt(0) // wait for 108/0 bytes of kern args +s_waitcnt lgkmcnt(0) // wait for 80/0 bytes of kern args /* Early stop if N(SizeFreeJ) == 0 */ s_cmp_eq_u32 s[sgprSizeJ], 0 @@ -851,25 +808,17 @@ s_endpgm label_NoEarlyStop_N0: label_MultiGemmEnd: -.set sgprSrdA, 72 -.set sgprSrdB, 76 -.set sgprShadowLimitA, 80 -.set sgprShadowLimitB, 82 -.set sgprStaggerUIter, 67 -.set sgprWrapUA, sgprKernArgAddress -.set sgprWrapUB, 84 -.set sgprGlobalReadIncsA, 86 -.set sgprGlobalReadIncsB, 87 -.set sgprScalarGlobalReadOffsetA, 88 -.set sgprScalarGlobalReadOffsetB, 95 - -.set sgpr104, 88 -.set sgpr105, 89 -.set sgpr106, 90 -.set sgpr107, 91 -.set sgpr108, 92 -.set sgpr109, 93 -.set sgpr110, 94 +.set sgprSrdA, 64 +.set sgprSrdB, 68 +.set sgprShadowLimitA, 72 +.set sgprShadowLimitB, 74 +.set sgprStaggerUIter, 76 +.set sgprWrapUA, 77 +.set sgprWrapUB, 79 +.set sgprGlobalReadIncsA, 81 +.set sgprGlobalReadIncsB, 82 +.set sgprScalarGlobalReadOffsetA, 83 +.set sgprScalarGlobalReadOffsetB, 90 s_sub_u32 s[sgprAddressA+0], s[sgprAddressA+0], 16 // pre-pad to make room for possible pointer shift s_subb_u32 s[sgprAddressA+1], s[sgprAddressA+1], 0 // pre-pad to make room for possible pointer shift s_sub_u32 s[sgprAddressB+0], s[sgprAddressB+0], 16 // pre-pad to make room for possible pointer shift @@ -883,28 +832,30 @@ label_AlphaNonZero: s_mov_b32 s[sgprStreamKIdx], s[sgprWorkGroup0] // Save original StreamK index s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprItersPerTile] // DP starting iteration (case: DP work to do) s_mov_b32 s[sgprStreamKIterEnd], s[sgprTotalIters] // DP ending iteration (case: only DP work to do) -s_mul_i32 s[sgpr104], s[sgprskTiles], s[sgprItersPerTile] // Total SK iters -s_cmp_lt_u32 s[sgpr104], s[sgprTotalIters] // Check if there are DP tiles to do +s_and_b32 s97, s[sgprskGridAndTiles], 0xffff // Get skTiles +s_mul_i32 s97, s97, s[sgprItersPerTile] // Total SK iters +s_cmp_lt_u32 s97, s[sgprTotalIters] // Check if there are DP tiles to do s_cbranch_scc1 label_SK_InitDone // Done init s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprSKItersPerWG] // StreamK starting iteration (case: after extra iters) s_add_u32 s[sgprStreamKIter], s[sgprStreamKIter], s[sgprskExtraIters] // Add extra iters s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIter], s[sgprSKItersPerWG] // StreamK ending iteration (case: after extra iters) -s_add_u32 s[sgpr105], s[sgprSKItersPerWG], 1 // Spread out extra iterations -s_mul_i32 s[sgpr104], s[sgprStreamKIdx], s[sgpr105] // StreamK starting iteration (case: before extra iters) -s_add_u32 s[sgpr105], s[sgpr104], s[sgpr105] // StreamK ending iteration (case: before extra iters) +s_add_u32 s98, s[sgprSKItersPerWG], 1 // Spread out extra iterations +s_mul_i32 s97, s[sgprStreamKIdx], s98 // StreamK starting iteration (case: before extra iters) +s_add_u32 s98, s97, s98 // StreamK ending iteration (case: before extra iters) s_cmp_lt_u32 s[sgprStreamKIdx], s[sgprskExtraIters] // Check if lane gets an extra iteration -s_cselect_b32 s[sgprStreamKIter], s[sgpr104], s[sgprStreamKIter] // Set start iter -s_cselect_b32 s[sgprStreamKIterEnd], s[sgpr105], s[sgprStreamKIterEnd] // Set end iter -s_mul_i32 s[sgpr104], s[sgprskTiles], s[sgprItersPerTile] // Total SK iters -s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgpr104] // Cap ending iter at total SK iters +s_cselect_b32 s[sgprStreamKIter], s97, s[sgprStreamKIter] // Set start iter +s_cselect_b32 s[sgprStreamKIterEnd], s98, s[sgprStreamKIterEnd] // Set end iter +s_and_b32 s97, s[sgprskGridAndTiles], 0xffff // Get skTiles +s_mul_i32 s97, s97, s[sgprItersPerTile] // Total SK iters +s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s97 // Cap ending iter at total SK iters label_SK_InitDone: s_cmp_lt_u32 s[sgprStreamKIter], s[sgprTotalIters] // Make sure there's work to do s_cbranch_scc1 label_NoBranch_T8JHFHKM7BO5OHXW // Only branch on scc0 -s_getpc_b64 s[sgpr104:sgpr105] // addr of next instr -s_add_i32 s[sgpr106], label_KernelEnd, 4 // target branch offset -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr106] // add target branch offset -s_addc_u32 s[sgpr105], s[sgpr105], 0 // add high and carry -s_setpc_b64 s[sgpr104:sgpr105] // branch to label_KernelEnd +s_getpc_b64 s[98:99] // addr of next instr +s_add_i32 s100, label_KernelEnd, 4 // target branch offset +s_add_u32 s98, s98, s100 // add target branch offset +s_addc_u32 s99, s99, 0 // add high and carry +s_setpc_b64 s[98:99] // branch to label_KernelEnd label_NoBranch_T8JHFHKM7BO5OHXW: /******************************************/ @@ -913,14 +864,10 @@ label_NoBranch_T8JHFHKM7BO5OHXW: label_PersistentLoopStart: // Use sgprScalarGlobalReadOffsetA/B sgprs -.set sgpr104, 88 -.set sgpr105, 89 -.set sgpr106, 90 -.set sgpr107, 91 -.set sgpr108, 92 -.set sgpr109, 93 -.set sgpr110, 94 - +.set sgpr102, 84 +.set sgpr103, 85 +.set sgpr104, 86 + /******************************************/ /* Begin setupNewTile */ /******************************************/ @@ -938,78 +885,106 @@ v_min_i32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], v4 // Set LRA to first b v_xor_b32 v4, v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // Get other lds buffer offset value v_min_i32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], v4 // Set LRA to first buffer offset /* StreamK calculate tile idx and map to WG */ -s_mul_hi_u32 s[sgpr105], s[sgprStreamKIter], s[sgprMagicNumberItersPerTile] // s_magic mul, div alg 2 -s_lshr_b32 s[sgpr106], s[sgprMagicShiftItersPerTile], 31 // tmpS = extract abit -s_mul_i32 s[sgpr104], s[sgprStreamKIter], s[sgpr106] // s_magic mul, div alg 2 -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr105] -s_and_b32 s[sgpr106], s[sgprMagicShiftItersPerTile], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s[sgpr104], s[sgpr104], s[sgpr106] // sMagicDiv Alg 2 -s_mul_i32 s[sgpr105], s[sgpr104], s[sgprItersPerTile] // Tile start iteration -s_add_u32 s[sgpr106], s[sgpr105], s[sgprItersPerTile] // Tile end iteration -s_sub_u32 s[sgprStreamKLocalStart], s[sgprStreamKIter], s[sgpr105] // Local iteration start -s_min_u32 s[sgprStreamKLocalEnd], s[sgprStreamKIterEnd], s[sgpr106] // 1. (Local) iteration end (SK tile) -s_sub_u32 s[sgprStreamKLocalEnd], s[sgprStreamKLocalEnd], s[sgpr105] // 2. Local iteration end (SK tile) -s_mul_i32 s[sgpr107], s[sgprskTiles], s[sgprItersPerTile] // Total SK iters -s_sub_u32 s[sgpr107], s[sgprTotalIters], s[sgpr107] // Offset to first SK tile -s_mul_i32 s[sgpr105], s[sgprskGrid], s[sgprItersPerTile] // DP iterations shift -s_add_u32 s[sgpr105], s[sgpr105], s[sgprStreamKIter] // Add DP shift -s_cmp_lt_u32 s[sgpr105], s[sgpr107] // Check if still in DP section +v_cvt_f32_u32 v4, s[sgprItersPerTile] // StreamKIter // ItersPerTile +v_rcp_iflag_f32 v4, v4 // StreamKIter // ItersPerTile +v_cvt_f32_u32 v5, s[sgprStreamKIter] // StreamKIter // ItersPerTile +v_mul_f32 v4, v4, v5 // StreamKIter // ItersPerTile +v_cvt_u32_f32 v4, v4 // StreamKIter // ItersPerTile +v_mul_u32_u24 v5, v4, s[sgprItersPerTile] // StreamKIter // ItersPerTile +v_sub_u32 v5, s[sgprStreamKIter], v5 // StreamKIter // ItersPerTile +v_cmpx_eq_u32 exec, v5, s[sgprItersPerTile] // StreamKIter // ItersPerTile +v_add_u32 v4, 1, v4 // StreamKIter // ItersPerTile +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprItersPerTile] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s98, v4 // quotient +s_mul_i32 s99, s98, s[sgprItersPerTile] // Tile start iteration +s_add_u32 s100, s99, s[sgprItersPerTile] // Tile end iteration +s_sub_u32 s[sgprStreamKLocalStart], s[sgprStreamKIter], s99 // Local iteration start +s_min_u32 s[sgprStreamKLocalEnd], s[sgprStreamKIterEnd], s100 // 1. (Local) iteration end (SK tile) +s_sub_u32 s[sgprStreamKLocalEnd], s[sgprStreamKLocalEnd], s99 // 2. Local iteration end (SK tile) +s_and_b32 s101, s[sgprskGridAndTiles], 0xffff // Get skTiles +s_mul_i32 s101, s101, s[sgprItersPerTile] // Total SK iters +s_sub_u32 s101, s[sgprTotalIters], s101 // Offset to first SK tile +s_lshr_b32 s99, s[sgprskGridAndTiles], 0x10 // Get skGrid +s_mul_i32 s99, s99, s[sgprItersPerTile] // DP iterations shift +s_add_u32 s99, s99, s[sgprStreamKIter] // Add DP shift +s_cmp_lt_u32 s99, s101 // Check if still in DP section s_cbranch_scc1 label_SK_UpdateDone // Done update -s_mov_b32 s[sgpr105], s[sgpr106] // SK iterations shift -s_cmp_le_u32 s[sgpr107], s[sgprStreamKIter] // Check if continuing in SK section +s_mov_b32 s99, s100 // SK iterations shift +s_cmp_le_u32 s101, s[sgprStreamKIter] // Check if continuing in SK section s_cbranch_scc1 label_SK_UpdateDone // Done update s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprSKItersPerWG] // StreamK starting iteration (case: after extra iters) s_add_u32 s[sgprStreamKIter], s[sgprStreamKIter], s[sgprskExtraIters] // Add extra iters s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIter], s[sgprSKItersPerWG] // StreamK ending iteration (case: after extra iters) -s_add_u32 s[sgpr109], s[sgprSKItersPerWG], 1 // Spread out extra iterations -s_mul_i32 s[sgpr108], s[sgprStreamKIdx], s[sgpr109] // StreamK starting iteration (case: before extra iters) -s_add_u32 s[sgpr109], s[sgpr108], s[sgpr109] // StreamK ending iteration (case: before extra iters) +s_add_u32 s[sgpr103], s[sgprSKItersPerWG], 1 // Spread out extra iterations +s_mul_i32 s[sgpr102], s[sgprStreamKIdx], s[sgpr103] // StreamK starting iteration (case: before extra iters) +s_add_u32 s[sgpr103], s[sgpr102], s[sgpr103] // StreamK ending iteration (case: before extra iters) s_cmp_lt_u32 s[sgprStreamKIdx], s[sgprskExtraIters] // Check if lane gets an extra iteration -s_cselect_b32 s[sgprStreamKIter], s[sgpr108], s[sgprStreamKIter] // Set start iter -s_cselect_b32 s[sgprStreamKIterEnd], s[sgpr109], s[sgprStreamKIterEnd] // Set end iter -s_add_u32 s[sgpr105], s[sgprStreamKIter], s[sgpr107] // Offset to start of SK section -s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgpr107] // Offset to start of SK section +s_cselect_b32 s[sgprStreamKIter], s[sgpr102], s[sgprStreamKIter] // Set start iter +s_cselect_b32 s[sgprStreamKIterEnd], s[sgpr103], s[sgprStreamKIterEnd] // Set end iter +s_add_u32 s99, s[sgprStreamKIter], s101 // Offset to start of SK section +s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s101 // Offset to start of SK section s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgprTotalIters] // Cap ending iter at total SK iters s_cmp_lt_u32 s[sgprStreamKIter], s[sgprTotalIters] // Make sure there's work to do s_cbranch_scc1 label_NoBranch_S4FDBQ587JJL6NOU // Only branch on scc0 -s_getpc_b64 s[sgpr108:sgpr109] // addr of next instr -s_add_i32 s[sgpr110], label_KernelEnd, 4 // target branch offset -s_add_u32 s[sgpr108], s[sgpr108], s[sgpr110] // add target branch offset -s_addc_u32 s[sgpr109], s[sgpr109], 0 // add high and carry -s_setpc_b64 s[sgpr108:sgpr109] // branch to label_KernelEnd +s_getpc_b64 s[sgpr102:sgpr103] // addr of next instr +s_add_i32 s[sgpr104], label_KernelEnd, 4 // target branch offset +s_add_u32 s[sgpr102], s[sgpr102], s[sgpr104] // add target branch offset +s_addc_u32 s[sgpr103], s[sgpr103], 0 // add high and carry +s_setpc_b64 s[sgpr102:sgpr103] // branch to label_KernelEnd label_NoBranch_S4FDBQ587JJL6NOU: label_SK_UpdateDone: -s_mov_b32 s[sgprStreamKIter], s[sgpr105] // Store current iteration +s_mov_b32 s[sgprStreamKIter], s99 // Store current iteration /* Map StreamK tile index to wg0/1/2 */ -s_mul_hi_u32 s[sgpr106], s[sgpr104], s[sgprMagicNumProblemNumGroupTiles0By1] // s_magic mul, div alg 2 -s_lshr_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0By1], 31 // tmpS = extract abit -s_mul_i32 s[sgpr105], s[sgpr104], s[sgpr107] // s_magic mul, div alg 2 -s_add_u32 s[sgpr105], s[sgpr105], s[sgpr106] -s_and_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0By1], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s[sgpr105], s[sgpr105], s[sgpr107] // sMagicDiv Alg 2 -s_mov_b32 s[sgprWorkGroup2], s[sgpr105] // wg2 = Tile Idx / problemNumGroupTiles0By1 -s_mul_i32 s[sgpr105], s[sgpr105], s[sgprNumWorkGroups0] // remainder part 1 : quotient * divisor -s_mul_i32 s[sgpr105], s[sgpr105], s[sgprNumWorkGroups1] // remainder part 1 : quotient * divisor -s_sub_u32 s[sgpr104], s[sgpr104], s[sgpr105] // remainder -s_mul_hi_u32 s[sgpr106], s[sgpr104], s[sgprMagicNumberProblemNumGroupTiles0] // s_magic mul, div alg 2 -s_lshr_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0], 31 // tmpS = extract abit -s_mul_i32 s[sgpr105], s[sgpr104], s[sgpr107] // s_magic mul, div alg 2 -s_add_u32 s[sgpr105], s[sgpr105], s[sgpr106] -s_and_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s[sgpr105], s[sgpr105], s[sgpr107] // sMagicDiv Alg 2 -s_mov_b32 s[sgprWorkGroup1], s[sgpr105] // wg1 = Tile Idx / problemNumGroupTiles0 -s_mul_i32 s[sgprWorkGroup0], s[sgpr105], s[sgprNumWorkGroups0] // remainder part 1 : quotient * divisor -s_sub_u32 s[sgprWorkGroup0], s[sgpr104], s[sgprWorkGroup0] // wg0 = Tile Idx % problemNumGroupTiles0 +s_mul_i32 s99, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] // Total tiles +v_cvt_f32_u32 v4, s99 // TileID // nWG0*nWG1 +v_rcp_iflag_f32 v4, v4 // TileID // nWG0*nWG1 +v_cvt_f32_u32 v5, s98 // TileID // nWG0*nWG1 +v_mul_f32 v4, v4, v5 // TileID // nWG0*nWG1 +v_cvt_u32_f32 v4, v4 // TileID // nWG0*nWG1 +v_mul_u32_u24 v5, v4, s99 // TileID // nWG0*nWG1 +v_sub_u32 v5, s98, v5 // TileID // nWG0*nWG1 +v_cmpx_eq_u32 exec, v5, s99 // TileID // nWG0*nWG1 +v_add_u32 v4, 1, v4 // TileID // nWG0*nWG1 +v_mov_b32 v5, 0 // TileID // nWG0*nWG1 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s99 // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s99 // re-calculate remainder +v_sub_u32 v5, s98, v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup2], v4 // quotient +v_readfirstlane_b32 s100, v5 // remainder +v_cvt_f32_u32 v4, s[sgprNumWorkGroups0] // TileID // nWG0 +v_rcp_iflag_f32 v4, v4 // TileID // nWG0 +v_cvt_f32_u32 v5, s100 // TileID // nWG0 +v_mul_f32 v4, v4, v5 // TileID // nWG0 +v_cvt_u32_f32 v4, v4 // TileID // nWG0 +v_mul_u32_u24 v5, v4, s[sgprNumWorkGroups0] // TileID // nWG0 +v_sub_u32 v5, s100, v5 // TileID // nWG0 +v_cmpx_eq_u32 exec, v5, s[sgprNumWorkGroups0] // TileID // nWG0 +v_add_u32 v4, 1, v4 // TileID // nWG0 +v_mov_b32 v5, 0 // TileID // nWG0 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprNumWorkGroups0] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s[sgprNumWorkGroups0] // re-calculate remainder +v_sub_u32 v5, s100, v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup1], v4 // quotient +v_readfirstlane_b32 s[sgprWorkGroup0], v5 // remainder v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0 // s[Alpha] == 0.0f ? s_cbranch_vccz label_SKAlphaCheck // branch if s[Alpha] != 0 s_cmp_eq_u32 s[sgprStreamKLocalStart], 0 // does wg start tile? s_cbranch_scc1 label_NoBranch_UR8VN3A1SJCPC6PO // Only branch on scc0 -s_getpc_b64 s[sgpr108:sgpr109] // addr of next instr -s_add_i32 s[sgpr110], label_GW_End, 4 // target branch offset -s_add_u32 s[sgpr108], s[sgpr108], s[sgpr110] // add target branch offset -s_addc_u32 s[sgpr109], s[sgpr109], 0 // add high and carry -s_setpc_b64 s[sgpr108:sgpr109] // branch to label_GW_End +s_getpc_b64 s[sgpr102:sgpr103] // addr of next instr +s_add_i32 s[sgpr104], label_GW_End, 4 // target branch offset +s_add_u32 s[sgpr102], s[sgpr102], s[sgpr104] // add target branch offset +s_addc_u32 s[sgpr103], s[sgpr103], 0 // add high and carry +s_setpc_b64 s[sgpr102:sgpr103] // branch to label_GW_End label_NoBranch_UR8VN3A1SJCPC6PO: s_mov_b32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Skip iterations label_SKAlphaCheck: @@ -1018,130 +993,130 @@ s_cmp_gt_i32 s[sgprWGM], 1 // WGM > 1 ? s_cbranch_scc1 label_WGMPositive // branch if WGM > 1 s_cmp_ge_i32 s[sgprWGM], 0 // WGM >= 0 ? s_cbranch_scc1 label_WGM // branch if WGM >= 0 -s_abs_i32 s[sgpr108], s[sgprWGM] // abs(WGM) -v_cvt_f32_u32 v4, s[sgpr108] // WGM +s_abs_i32 s101, s[sgprWGM] // abs(WGM) +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprWorkGroup0] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprWorkGroup0], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr106], v4 // quotient -s_mul_i32 s[sgpr107], s[sgpr106], s[sgpr108] // quotient * non-magic divisor -s_sub_u32 s[sgpr107], s[sgprWorkGroup0], s[sgpr107] // WorkGroup0=remainder -s_mul_i32 s[sgpr107], s[sgpr107], s[sgprNumWorkGroups1] // (wg1 % WGM)*NumWorkGroups1 -s_add_u32 s[sgpr107], s[sgpr107], s[sgprWorkGroup1] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups1 -v_cvt_f32_u32 v4, s[sgpr108] // WGM +v_readfirstlane_b32 s97, v4 // quotient +s_mul_i32 s100, s97, s101 // quotient * non-magic divisor +s_sub_u32 s100, s[sgprWorkGroup0], s100 // WorkGroup0=remainder +s_mul_i32 s100, s100, s[sgprNumWorkGroups1] // (wg1 % WGM)*NumWorkGroups1 +s_add_u32 s100, s100, s[sgprWorkGroup1] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups1 +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprNumWorkGroups0] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprNumWorkGroups0], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr104], v4 // quotient -s_mul_i32 s[sgpr105], s[sgpr108], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgpr105], s[sgprNumWorkGroups0], s[sgpr105] // NumWorkGroups0=remainder -s_cmp_eq_u32 s[sgpr105], 0 // remainder == 0 ? -s_cmov_b32 s[sgpr105], s[sgpr108] // remainder = WGM if remainder == 0 -s_cmp_ge_u32 s[sgpr106], s[sgpr104] // blockId >= numFullBlocks ? -s_cselect_b32 s[sgpr104], s[sgpr105], s[sgpr108] -v_cvt_f32_u32 v4, s[sgpr104] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_cvt_f32_u32 v5, s[sgpr107] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_mul_f32 v4, v4, v5 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_mul_u32_u24 v5, v4, s[sgpr104] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_sub_u32 v5, s[sgpr107], v5 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_cmpx_eq_u32 exec, v5, s[sgpr104] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_add_u32 v4, 1, v4 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_mov_b32 v5, 0 // s[sgprWorkGroup0] = s[sgpr107] % s[sgpr104] +v_readfirstlane_b32 s98, v4 // quotient +s_mul_i32 s99, s101, s98 // quotient * non-magic divisor +s_sub_u32 s99, s[sgprNumWorkGroups0], s99 // NumWorkGroups0=remainder +s_cmp_eq_u32 s99, 0 // remainder == 0 ? +s_cmov_b32 s99, s101 // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s97, s98 // blockId >= numFullBlocks ? +s_cselect_b32 s98, s99, s101 +v_cvt_f32_u32 v4, s98 // s[sgprWorkGroup1] = s100 / s98 +v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup1] = s100 / s98 +v_cvt_f32_u32 v5, s100 // s[sgprWorkGroup1] = s100 / s98 +v_mul_f32 v4, v4, v5 // s[sgprWorkGroup1] = s100 / s98 +v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup1] = s100 / s98 +v_mul_u32_u24 v5, v4, s98 // s[sgprWorkGroup1] = s100 / s98 +v_sub_u32 v5, s100, v5 // s[sgprWorkGroup1] = s100 / s98 +v_cmpx_eq_u32 exec, v5, s98 // s[sgprWorkGroup1] = s100 / s98 +v_add_u32 v4, 1, v4 // s[sgprWorkGroup1] = s100 / s98 +v_mov_b32 v5, 0 // s[sgprWorkGroup0] = s100 % s98 s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr104] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s98 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 -v_mul_u32_u24 v5, v4, s[sgpr104] // re-calculate remainder -v_sub_u32 v5, s[sgpr107], v5 // re-calculate remainder +v_mul_u32_u24 v5, v4, s98 // re-calculate remainder +v_sub_u32 v5, s100, v5 // re-calculate remainder s_mov_b64 exec, -1 // Reset exec v_readfirstlane_b32 s[sgprWorkGroup1], v4 // quotient v_readfirstlane_b32 s[sgprWorkGroup0], v5 // remainder -s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgprWorkGroup0], s[sgpr107], s[sgprWorkGroup0] // WorkGroup0=remainder -s_mul_i32 s[sgpr106], s[sgpr106], s[sgpr108] // blockId * WGM -s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s[sgpr106] // wg1 += blockId * WGM +s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s98 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup0], s100, s[sgprWorkGroup0] // WorkGroup0=remainder +s_mul_i32 s97, s97, s101 // blockId * WGM +s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s97 // wg1 += blockId * WGM s_branch label_WGM label_WGMPositive: -s_mov_b32 s[sgpr108], s[sgprWGM] // WGM -v_cvt_f32_u32 v4, s[sgpr108] // WGM +s_mov_b32 s101, s[sgprWGM] // WGM +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprWorkGroup1] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprWorkGroup1], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr106], v4 // quotient -s_mul_i32 s[sgpr107], s[sgpr106], s[sgpr108] // quotient * non-magic divisor -s_sub_u32 s[sgpr107], s[sgprWorkGroup1], s[sgpr107] // WorkGroup1=remainder -s_mul_i32 s[sgpr107], s[sgpr107], s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 -s_add_u32 s[sgpr107], s[sgpr107], s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 -v_cvt_f32_u32 v4, s[sgpr108] // WGM +v_readfirstlane_b32 s97, v4 // quotient +s_mul_i32 s100, s97, s101 // quotient * non-magic divisor +s_sub_u32 s100, s[sgprWorkGroup1], s100 // WorkGroup1=remainder +s_mul_i32 s100, s100, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s100, s100, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprNumWorkGroups1] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprNumWorkGroups1], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr104], v4 // quotient -s_mul_i32 s[sgpr105], s[sgpr108], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgpr105], s[sgprNumWorkGroups1], s[sgpr105] // NumWorkGroups1=remainder -s_cmp_eq_u32 s[sgpr105], 0 // remainder == 0 ? -s_cmov_b32 s[sgpr105], s[sgpr108] // remainder = WGM if remainder == 0 -s_cmp_ge_u32 s[sgpr106], s[sgpr104] // blockId >= numFullBlocks ? -s_cselect_b32 s[sgpr104], s[sgpr105], s[sgpr108] -v_cvt_f32_u32 v4, s[sgpr104] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_cvt_f32_u32 v5, s[sgpr107] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_mul_f32 v4, v4, v5 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_mul_u32_u24 v5, v4, s[sgpr104] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_sub_u32 v5, s[sgpr107], v5 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_cmpx_eq_u32 exec, v5, s[sgpr104] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_add_u32 v4, 1, v4 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_mov_b32 v5, 0 // s[sgprWorkGroup1] = s[sgpr107] % s[sgpr104] +v_readfirstlane_b32 s98, v4 // quotient +s_mul_i32 s99, s101, s98 // quotient * non-magic divisor +s_sub_u32 s99, s[sgprNumWorkGroups1], s99 // NumWorkGroups1=remainder +s_cmp_eq_u32 s99, 0 // remainder == 0 ? +s_cmov_b32 s99, s101 // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s97, s98 // blockId >= numFullBlocks ? +s_cselect_b32 s98, s99, s101 +v_cvt_f32_u32 v4, s98 // s[sgprWorkGroup0] = s100 / s98 +v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup0] = s100 / s98 +v_cvt_f32_u32 v5, s100 // s[sgprWorkGroup0] = s100 / s98 +v_mul_f32 v4, v4, v5 // s[sgprWorkGroup0] = s100 / s98 +v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup0] = s100 / s98 +v_mul_u32_u24 v5, v4, s98 // s[sgprWorkGroup0] = s100 / s98 +v_sub_u32 v5, s100, v5 // s[sgprWorkGroup0] = s100 / s98 +v_cmpx_eq_u32 exec, v5, s98 // s[sgprWorkGroup0] = s100 / s98 +v_add_u32 v4, 1, v4 // s[sgprWorkGroup0] = s100 / s98 +v_mov_b32 v5, 0 // s[sgprWorkGroup1] = s100 % s98 s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr104] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s98 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 -v_mul_u32_u24 v5, v4, s[sgpr104] // re-calculate remainder -v_sub_u32 v5, s[sgpr107], v5 // re-calculate remainder +v_mul_u32_u24 v5, v4, s98 // re-calculate remainder +v_sub_u32 v5, s100, v5 // re-calculate remainder s_mov_b64 exec, -1 // Reset exec v_readfirstlane_b32 s[sgprWorkGroup0], v4 // quotient v_readfirstlane_b32 s[sgprWorkGroup1], v5 // remainder -s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgprWorkGroup1], s[sgpr107], s[sgprWorkGroup1] // WorkGroup1=remainder -s_mul_i32 s[sgpr106], s[sgpr106], s[sgpr108] // blockId * WGM -s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s[sgpr106] // wg1 += blockId * WGM +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s98 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s100, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s97, s97, s101 // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s97 // wg1 += blockId * WGM label_WGM: /******************************************/ @@ -1175,8 +1150,8 @@ v_lshl_add_u32 v5, v7, 14, v5 // 7. wave offset in M dimen: /* local read addresses: final offsets a */ v_lshrrev_b32 v6, 6, v[vgprSerial] // 6 = Serial / 64 v_lshrrev_b32 v6, 2, v6 // LSU offset: Get LSU wave_id -s_mov_b32 s[sgpr104], 128 // LSU offset: stride = lsuStride(128) when umlds==True -v_mul_lo_u32 v6, s[sgpr104], v6 // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD) +s_mov_b32 s97, 128 // LSU offset: stride = lsuStride(128) when umlds==True +v_mul_lo_u32 v6, s97, v6 // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD) v_add_u32 v[vgprLocalReadAddrA], v6, v4 // Final Offset: offset = (lro0+lsuoffset)*bpeDS(1) v_lshrrev_b32 v7, 10, v[vgprLocalReadAddrA] // Final Offset: padding 32 per block 1024 v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 5, v[vgprLocalReadAddrA] // Final Offset: padding 32 per block 1024 @@ -1185,7 +1160,7 @@ v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 5, v[vgprLocalReadAddrA] // Final Offs v_lshrrev_b32 v4, 6, v[vgprSerial] // 4 = Serial / 64 v_lshrrev_b32 v4, 2, v4 // LSU offset: Get LSU wave_id // LSU offset: stride = lsuStride(128) when umlds==True (dup assign opt.) -v_mul_lo_u32 v4, s[sgpr104], v4 // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD) +v_mul_lo_u32 v4, s97, v4 // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD) v_add_u32 v[vgprLocalReadAddrB], v4, v5 // Final Offset: offset = (lro1+lsuoffset)*bpeDS(1) v_lshrrev_b32 v6, 10, v[vgprLocalReadAddrB] // Final Offset: padding 32 per block 1024 v_lshl_add_u32 v[vgprLocalReadAddrB], v6, 5, v[vgprLocalReadAddrB] // Final Offset: padding 32 per block 1024 @@ -1300,108 +1275,80 @@ s_mul_i32 s[sgprScalarGlobalReadOffsetB+5], s[sgprStrideB1J], 192 // compute off s_mul_i32 s[sgprScalarGlobalReadOffsetB+6], s[sgprStrideB1J], 224 // compute offset diff (scaled tileDim) // scalar offset *= bytes/element (multiplier is 1, do nothing) -// Use sgprScalarGlobalReadOffsetA sgprs -.set sgpr104, sgprSKItersPerWG // skitersperwg, overwrite, 54 -.set sgpr105, sgprskGrid // skgrid, overwrite, 55 -.set sgpr106, sgprMagicNumberProblemNumGroupTiles0 // sgprMagicNumberProblemNumGroupTiles0, 46 -.set sgpr107, sgprMagicShiftProblemNumGroupTiles0 // sgprMagicShiftProblemNumGroupTiles0, 47 -.set sgpr108, sgprMagicShiftItersPerTile // sgprMagicShiftItersPerTile, 50 -.set sgpr109, sgprMagicNumProblemNumGroupTiles0By1 // sgprMagicNumProblemNumGroupTiles0By1, 51 -.set sgpr110, sgprWGM // wgm, 7 - -// Save sgpr values to vgpr -v_writelane_b32 v255, s[sgprSKItersPerWG], 0 -s_nop 0 -v_writelane_b32 v255, s[sgprskGrid], 1 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicNumberProblemNumGroupTiles0], 2 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicShiftProblemNumGroupTiles0], 3 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicShiftItersPerTile], 4 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicNumProblemNumGroupTiles0By1], 5 -s_nop 0 -v_writelane_b32 v255, s[sgprWGM], 6 -s_nop 0 -v_writelane_b32 v255, s[sgprKernArgAddress], 7 -s_nop 0 -v_writelane_b32 v255, s[sgprKernArgAddress+1], 8 - /* global read addresses: addresses a */ /* max read offset = size[n] * stride[n-1] */ -s_mul_hi_u32 s[sgpr107], s[sgprWorkGroup0], 256 // WorkGroup[01] * MT -s_mul_i32 s[sgpr106], s[sgprWorkGroup0], 256 // WorkGroup[01] * MT -s_mul_hi_u32 s[sgpr107], s[sgpr106], s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr106], s[sgpr106], s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr104], s[sgprStreamKLocalStart], DepthU // StreamK tile start offset -s_mul_hi_u32 s[sgpr105], s[sgpr104], constStrideAL // StreamK tile start offset -s_mul_i32 s[sgpr104], s[sgpr104], constStrideAL // StreamK tile start offset -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum GsuOffset term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum GsuOffset term to tilestart +s_mul_hi_u32 s101, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_i32 s100, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s101, s100, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_mul_i32 s100, s100, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_mul_i32 s98, s[sgprStreamKLocalStart], DepthU // StreamK tile start offset +s_mul_hi_u32 s99, s98, constStrideAL // StreamK tile start offset +s_mul_i32 s98, s98, constStrideAL // StreamK tile start offset +s_add_u32 s100, s100, s98 // accum GsuOffset term to tilestart +s_addc_u32 s101, s101, s99 // accum GsuOffset term to tilestart s_mov_b64 s[sgprShadowLimitA+0:sgprShadowLimitA+0+1], 1 // Init tensor size -s_sub_u32 s[sgpr104], s[sgprSizeL], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], constStrideAL, s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], constStrideAL, s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgpr104], s[sgprSizeI], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], s[sgprStrideA0I], s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], s[sgprStrideA0I], s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr106] // sub tileStart -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr107] // sub tileStart +s_sub_u32 s98, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s99, constStrideAL, s98 // stride x (size-1) +s_mul_i32 s98, constStrideAL, s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // sum tensor size +s_sub_u32 s98, s[sgprSizeI], 1 // (size-1) +s_mul_hi_u32 s99, s[sgprStrideA0I], s98 // stride x (size-1) +s_mul_i32 s98, s[sgprStrideA0I], s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // sum tensor size +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s100 // sub tileStart +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s101 // sub tileStart // Set limit to use bytes (byte is 1, do nothing) s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 16 // extend limit for pre-pad s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 -s_mul_hi_u32 s[sgpr105], s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG -s_mul_i32 s[sgpr104], s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum wg term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum wg term to tilestart +s_mul_hi_u32 s99, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s98, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s100, s100, s98 // accum wg term to tilestart +s_addc_u32 s101, s101, s99 // accum wg term to tilestart // tileStart *= BPE (multiplier is 1, do nothing) -s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgpr106] // SRD base = Address+ tileStart0 -s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgpr107] // SRD base = Address+ tileStart1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s100 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s101 // SRD base = Address+ tileStart1 s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD /* global read addresses: addresses b */ /* max read offset = size[n] * stride[n-1] */ -s_mul_hi_u32 s[sgpr107], s[sgprWorkGroup1], 256 // WorkGroup[01] * MT -s_mul_i32 s[sgpr106], s[sgprWorkGroup1], 256 // WorkGroup[01] * MT -s_mul_hi_u32 s[sgpr107], s[sgpr106], s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr106], s[sgpr106], s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr104], s[sgprStreamKLocalStart], DepthU // StreamK tile start offset -s_mul_hi_u32 s[sgpr105], s[sgpr104], constStrideBL // StreamK tile start offset -s_mul_i32 s[sgpr104], s[sgpr104], constStrideBL // StreamK tile start offset -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum GsuOffset term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum GsuOffset term to tilestart +s_mul_hi_u32 s101, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_i32 s100, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s101, s100, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_mul_i32 s100, s100, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_mul_i32 s98, s[sgprStreamKLocalStart], DepthU // StreamK tile start offset +s_mul_hi_u32 s99, s98, constStrideBL // StreamK tile start offset +s_mul_i32 s98, s98, constStrideBL // StreamK tile start offset +s_add_u32 s100, s100, s98 // accum GsuOffset term to tilestart +s_addc_u32 s101, s101, s99 // accum GsuOffset term to tilestart s_mov_b64 s[sgprShadowLimitB+0:sgprShadowLimitB+0+1], 1 // Init tensor size -s_sub_u32 s[sgpr104], s[sgprSizeL], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], constStrideBL, s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], constStrideBL, s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgpr104], s[sgprSizeJ], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], s[sgprStrideB1J], s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], s[sgprStrideB1J], s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr106] // sub tileStart -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr107] // sub tileStart +s_sub_u32 s98, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s99, constStrideBL, s98 // stride x (size-1) +s_mul_i32 s98, constStrideBL, s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // sum tensor size +s_sub_u32 s98, s[sgprSizeJ], 1 // (size-1) +s_mul_hi_u32 s99, s[sgprStrideB1J], s98 // stride x (size-1) +s_mul_i32 s98, s[sgprStrideB1J], s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // sum tensor size +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s100 // sub tileStart +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s101 // sub tileStart // Set limit to use bytes (byte is 1, do nothing) s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], 16 // extend limit for pre-pad s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], 0 // extend limit for pre-pad s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 -s_mul_hi_u32 s[sgpr105], s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG -s_mul_i32 s[sgpr104], s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum wg term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum wg term to tilestart +s_mul_hi_u32 s99, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s98, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s100, s100, s98 // accum wg term to tilestart +s_addc_u32 s101, s101, s99 // accum wg term to tilestart // tileStart *= BPE (multiplier is 1, do nothing) -s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgpr106] // SRD base = Address+ tileStart0 -s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgpr107] // SRD base = Address+ tileStart1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s100 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s101 // SRD base = Address+ tileStart1 s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD /* global read addresses: increments a */ @@ -1415,87 +1362,87 @@ v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0 // s[Alpha] == 0.0f ? s_cbranch_vccz label_SKAlphaCheck2 // branch if s[Alpha] != 0 s_mov_b32 s[sgprLoopCounterL], 0 // Skip iterations label_SKAlphaCheck2: -s_and_b32 s[sgpr105], 127, s[sgprSizesSum+0] // s[sgpr105] = s[sgprSizesSum+0] % 128 -s_cmp_eq_u32 s[sgpr105], 0 // numIterL == 0 -s_cselect_b32 s[sgpr104], 0, 1 // check if size uses tail loop +s_and_b32 s99, 127, s[sgprSizesSum+0] // s99 = s[sgprSizesSum+0] % 128 +s_cmp_eq_u32 s99, 0 // numIterL == 0 +s_cselect_b32 s98, 0, 1 // check if size uses tail loop s_cmp_eq_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Check if WG processes final iteration of tile -s_cselect_b32 s[sgpr104], s[sgpr104], 0 // this WG runs tail loop -s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], s[sgpr104] // Adjust loop counter for tail loop +s_cselect_b32 s98, s98, 0 // this WG runs tail loop +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], s98 // Adjust loop counter for tail loop s_mov_b32 s[sgprOrigLoopCounter], s[sgprLoopCounterL] // copy loop counter -s_and_b32 s[sgpr106], s[sgprStaggerU], 0x1f00 -s_lshr_b32 s[sgpr106], s[sgpr106], 0x8 -s_and_b32 s[sgpr107], s[sgprStaggerU], 0xe000 +s_and_b32 s100, s[sgprStaggerU], 0x1f00 +s_lshr_b32 s100, s100, 0x8 +s_and_b32 s101, s[sgprStaggerU], 0xe000 s_and_b32 s[sgprStaggerU], s[sgprStaggerU], 0xff -s_mov_b32 s[sgpr104], s[sgprStaggerU] // init staggerU +s_mov_b32 s98, s[sgprStaggerU] // init staggerU label_beginStaggerUIter: -s_lshl_b32 s[sgpr105], s[sgpr104], s[sgpr106] // shift by StaggerUStride -s_cmp_ge_u32 s[sgprOrigLoopCounter], s[sgpr105] // loopCount >= current shift Count +s_lshl_b32 s99, s98, s100 // shift by StaggerUStride +s_cmp_ge_u32 s[sgprOrigLoopCounter], s99 // loopCount >= current shift Count s_cbranch_scc1 label_endStaggerUIter // jump to end -s_lshr_b32 s[sgpr104], s[sgpr104], 1 // step down to smaller stagger +s_lshr_b32 s98, s98, 1 // step down to smaller stagger s_branch label_beginStaggerUIter // jump to begin label_endStaggerUIter: -s_sub_u32 s[sgpr105], s[sgpr104], 1 // staggerU mask -s_cmp_ge_u32 s[sgpr104], 1 // if current staggerU >= 1 -s_cselect_b32 s[sgprStaggerUIter], s[sgpr105], 0 // set Mask -s_cmp_eq_u32 s[sgpr107], 0x0 +s_sub_u32 s99, s98, 1 // staggerU mask +s_cmp_ge_u32 s98, 1 // if current staggerU >= 1 +s_cselect_b32 s[sgprStaggerUIter], s99, 0 // set Mask +s_cmp_eq_u32 s101, 0x0 s_cbranch_scc1 label_StaggerUMapping_1 -s_mov_b32 s[sgpr104], s[sgprWorkGroup0] +s_mov_b32 s98, s[sgprWorkGroup0] s_branch label_staggerInputEnd label_StaggerUMapping_1: -s_cmp_eq_u32 s[sgpr107], 0x2000 +s_cmp_eq_u32 s101, 0x2000 s_cbranch_scc1 label_StaggerUMapping_2 -s_mov_b32 s[sgpr104], s[sgprWorkGroup1] +s_mov_b32 s98, s[sgprWorkGroup1] s_branch label_staggerInputEnd label_StaggerUMapping_2: -s_cmp_eq_u32 s[sgpr107], 0x4000 +s_cmp_eq_u32 s101, 0x4000 s_cbranch_scc1 label_StaggerUMapping_3 -s_mov_b32 s[sgpr104], -0x1 +s_mov_b32 s98, -0x1 s_branch label_staggerInputEnd label_StaggerUMapping_3: -s_cmp_eq_u32 s[sgpr107], 0x6000 +s_cmp_eq_u32 s101, 0x6000 s_cbranch_scc1 label_StaggerUMapping_4 -s_mul_i32 s[sgpr105], s[sgprNumWorkGroups0], s[sgprWorkGroup1] -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr105] -s_add_u32 s[sgpr104], s[sgpr104], s[sgprWorkGroup0] +s_mul_i32 s99, s[sgprNumWorkGroups0], s[sgprWorkGroup1] +s_add_u32 s98, s98, s99 +s_add_u32 s98, s98, s[sgprWorkGroup0] s_branch label_staggerInputEnd label_StaggerUMapping_4: -s_cmp_eq_u32 s[sgpr107], 0x8000 +s_cmp_eq_u32 s101, 0x8000 s_cbranch_scc1 label_staggerInputEnd -s_mov_b32 s[sgpr104], -0x1 +s_mov_b32 s98, -0x1 s_branch label_staggerInputEnd label_staggerInputEnd: -s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s[sgpr104] // Compute actual stagger start for this tile -s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s[sgpr106] // shift by StaggerUStride +s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s98 // Compute actual stagger start for this tile +s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s100 // shift by StaggerUStride s_cmp_gt_u32 s[sgprStreamKLocalStart], 0 // does wg start tile? s_cmov_b32 s[sgprStaggerUIter], 0 // set stagger=0 for partial tiles s_cmp_lt_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // does wg finish tile? s_cmov_b32 s[sgprStaggerUIter], 0 // set stagger=0 for partial tiles /* SRDs += (StaggerUIter) * GlobalReadIncsA+0 */ -s_mul_hi_i32 s[sgpr105], s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset -s_mul_i32 s[sgpr104], s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset +s_mul_hi_i32 s99, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset +s_mul_i32 s98, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset s_mul_hi_i32 s[sgprWrapUA+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop s_mul_i32 s[sgprWrapUA+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop s_sub_u32 s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0], s[sgprWrapUA+0] // remove one iteration s_subb_u32 s[sgprWrapUA+1], 0, s[sgprWrapUA+1] // remove one iteration -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 /* SRDs += (StaggerUIter) * GlobalReadIncsB+0 */ -s_mul_hi_i32 s[sgpr105], s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset -s_mul_i32 s[sgpr104], s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset +s_mul_hi_i32 s99, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset +s_mul_i32 s98, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset s_mul_hi_i32 s[sgprWrapUB+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop s_mul_i32 s[sgprWrapUB+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop s_sub_u32 s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0], s[sgprWrapUB+0] // remove one iteration s_subb_u32 s[sgprWrapUB+1], 0, s[sgprWrapUB+1] // remove one iteration -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 s_add_u32 s[sgprStaggerUIter], s[sgprStaggerUIter], 2 // Subtract (PGR-1); StaggerUIter now contains target iteration to wrap @@ -1545,26 +1492,26 @@ s_add_u32 m0, m0, 4224 // Move LDS write address to buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0 /* global read inc A loopL */ -s_add_u32 s[sgpr106], s[sgprLoopCounterL], 1 // remove pf(1) -s_cmp_eq_u32 s[sgprStaggerUIter], s[sgpr106] // Is this wrapIter? (pf) -s_cselect_b32 s[sgpr104], s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUA+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_add_u32 s100, s[sgprLoopCounterL], 1 // remove pf(1) +s_cmp_eq_u32 s[sgprStaggerUIter], s100 // Is this wrapIter? (pf) +s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUA+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 /* global read inc B loopL */ -s_add_u32 s[sgpr106], s[sgprLoopCounterL], 1 // remove pf(1) -s_cmp_eq_u32 s[sgprStaggerUIter], s[sgpr106] // Is this wrapIter? (pf) -s_cselect_b32 s[sgpr104], s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUB+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_add_u32 s100, s[sgprLoopCounterL], 1 // remove pf(1) +s_cmp_eq_u32 s[sgprStaggerUIter], s100 // Is this wrapIter? (pf) +s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUB+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 @@ -1581,28 +1528,28 @@ s_mov_b32 s[sgprSrdC+2], BufferOOB s_mov_b32 s[sgprSrdC+3], Srd127_96 // Set bits 127_96 in post-loop SRD -s_mul_i32 s[sgpr106], MT1, s[sgprWorkGroup1] // <- wg1*MT1 -s_mul_hi_u32 s[sgpr105], s[sgpr106], s[sgprStrideC1J] // ScaleC s[sgpr106] by Stride -s_mul_i32 s[sgpr104], s[sgpr106], s[sgprStrideC1J] // ScaleC s[sgpr106] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s[sgpr105] // add hi to SRD -s_mul_hi_u32 s[sgpr105], s[sgpr106], s[sgprStrideD1J] // ScaleD s[sgpr106] by Stride -s_mul_i32 s[sgpr104], s[sgpr106], s[sgprStrideD1J] // ScaleD s[sgpr106] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s[sgpr105] // add hi to SRD - -s_mul_hi_u32 s[sgpr105], s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride -s_mul_i32 s[sgpr104], s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s[sgpr105] // add hi to SRD -s_mul_hi_u32 s[sgpr105], s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride -s_mul_i32 s[sgpr104], s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgpr105] // add hi to SRD +s_mul_i32 s100, MT1, s[sgprWorkGroup1] // <- wg1*MT1 +s_mul_hi_u32 s99, s100, s[sgprStrideC1J] // ScaleC s100 by Stride +s_mul_i32 s98, s100, s[sgprStrideC1J] // ScaleC s100 by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s99 // add hi to SRD +s_mul_hi_u32 s99, s100, s[sgprStrideD1J] // ScaleD s100 by Stride +s_mul_i32 s98, s100, s[sgprStrideD1J] // ScaleD s100 by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s99 // add hi to SRD + +s_mul_hi_u32 s99, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride +s_mul_i32 s98, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s99 // add hi to SRD +s_mul_hi_u32 s99, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride +s_mul_i32 s98, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s99 // add hi to SRD /* initC: remove ValuC vgpr buffer [0...0) from pool */ @@ -1870,11 +1817,11 @@ s_cmp_eq_u32 s[sgprLoopCounterL], 0 // at last iteration? /* after InitC, skip to end of prefetch last iter if numIter==0 */ s_cbranch_scc0 label_NoBranch_8S4L1KCK9VFC7AQU // Only branch on scc1 -s_getpc_b64 s[sgpr104:sgpr105] // addr of next instr -s_add_i32 s[sgpr106], label_PrefetchGlobalLastIterEnd, 4 // target branch offset -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr106] // add target branch offset -s_addc_u32 s[sgpr105], s[sgpr105], 0 // add high and carry -s_setpc_b64 s[sgpr104:sgpr105] // branch to label_PrefetchGlobalLastIterEnd +s_getpc_b64 s[98:99] // addr of next instr +s_add_i32 s100, label_PrefetchGlobalLastIterEnd, 4 // target branch offset +s_add_u32 s98, s98, s100 // add target branch offset +s_addc_u32 s99, s99, 0 // add high and carry +s_setpc_b64 s[98:99] // branch to label_PrefetchGlobalLastIterEnd label_NoBranch_8S4L1KCK9VFC7AQU: s_waitcnt vmcnt(0) // wait for global read s_barrier // For stream-k / persistent loop @@ -1924,6 +1871,7 @@ s_add_u32 m0, m0, 4224 // Move LDS write address to buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> Reg 0_0_6_0 s_add_u32 m0, m0, 4224 // Move LDS write address to next line buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0 + /* local write swap a */ s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR @@ -1954,7 +1902,7 @@ ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] o ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0 s_waitcnt lgkmcnt(0) - + /******************************************/ /* Unrolled Loop(s) - Begin */ /******************************************/ @@ -1995,15 +1943,15 @@ ds_read_b128 v[vgprValuA_X0_I0+60:vgprValuA_X0_I0+60+3], v[vgprLocalReadAddrA] o v_mfma_f32_16x16x128_f8f6f4 acc[32:35], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[32:35] cbsz:1 blgp:0 // left value = acc[32+0:35+0] /* global read inc A loopL */ s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s[sgpr104], s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUA+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) +s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUA+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) /* mfmaIndex:9 */ v_mfma_f32_16x16x128_f8f6f4 acc[36:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[36:39] cbsz:1 blgp:0 // left value = acc[36+0:39+0] -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address @@ -2018,10 +1966,10 @@ s_barrier v_mfma_f32_16x16x128_f8f6f4 acc[44:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[44:47] cbsz:1 blgp:0 // left value = acc[44+0:47+0] /* global read inc B loopL */ s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s[sgpr104], s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUB+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) +s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUB+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) /* mfmaIndex:16 */ v_mfma_f32_16x16x128_f8f6f4 acc[64:67], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[64:67] cbsz:1 blgp:0 // left value = acc[64+0:67+0] @@ -2078,8 +2026,8 @@ v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB /* mfmaIndex:6 */ v_mfma_f32_16x16x128_f8f6f4 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[24:27] cbsz:1 blgp:0 // left value = acc[24+0:27+0] -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 @@ -2332,15 +2280,15 @@ v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB v_mfma_f32_16x16x128_f8f6f4 acc[72:75], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[72:75] cbsz:1 blgp:0 // left value = acc[72+0:75+0] /* global read inc A loopL */ s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s[sgpr104], s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUA+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) +s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUA+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) /* mfmaIndex:19 */ v_mfma_f32_16x16x128_f8f6f4 acc[76:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[76:79] cbsz:1 blgp:0 // left value = acc[76+0:79+0] -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 @@ -2348,15 +2296,15 @@ s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow t v_mfma_f32_16x16x128_f8f6f4 acc[96:99], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[96:99] cbsz:1 blgp:0 // left value = acc[96+0:99+0] /* global read inc B loopL */ s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s[sgpr104], s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUB+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) +s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUB+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) /* mfmaIndex:25 */ v_mfma_f32_16x16x128_f8f6f4 acc[100:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[100:103] cbsz:1 blgp:0 // left value = acc[100+0:103+0] -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 @@ -2762,12 +2710,12 @@ label_PrefetchGlobalLastIterEnd: /******************************************/ /* local write reset offsets a */ -s_xor_b32 s[sgpr104], s[sgprSwapA], s[sgprLocalWriteAddrA] // Get other lds buffer offset value -s_min_u32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgpr104] // Set LWA to first buffer offset +s_xor_b32 s97, s[sgprSwapA], s[sgprLocalWriteAddrA] // Get other lds buffer offset value +s_min_u32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s97 // Set LWA to first buffer offset /* local write reset offsets b */ -s_xor_b32 s[sgpr104], s[sgprSwapB], s[sgprLocalWriteAddrB] // Get other lds buffer offset value -s_min_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgpr104] // Set LWA to first buffer offset +s_xor_b32 s97, s[sgprSwapB], s[sgprLocalWriteAddrB] // Get other lds buffer offset value +s_min_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s97 // Set LWA to first buffer offset /* Check out VGPR (numG2LA,numG2LB,numG2LMetadata) = (32,32,0) */ .set vgprG2LA_BASE, 4 .set vgprG2LA, vgprG2LA_BASE+0 @@ -2786,57 +2734,56 @@ s_mov_b32 s[sgprOrigLoopCounter], 0 // repurpose to count each lo s_cbranch_scc1 label_SkipTailLoopL // skip to end of tail loop b/c numIter==0 /* remove stagger offsets for tail loop */ -s_sub_i32 s[sgpr104], 3, s[sgprStaggerUIter] -s_cmp_ge_i32 s[sgpr104], 0 +s_sub_i32 s98, 3, s[sgprStaggerUIter] +s_cmp_ge_i32 s98, 0 s_cbranch_scc0 label_Negative_J5DQFVGFWLXU2DUR -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes s_branch label_MultiplyDone_DLSAQLEVYLOBCPNL label_Negative_J5DQFVGFWLXU2DUR: -s_abs_i32 s[sgpr104], s[sgpr104] -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_xor_b32 s[sgpr104], s[sgpr104], 0xffffffff -s_xor_b32 s[sgpr105], s[sgpr105], 0xffffffff -s_add_u32 s[sgpr104], s[sgpr104], 0x1 -s_addc_u32 s[sgpr105], s[sgpr105], 0 +s_abs_i32 s98, s98 +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_xor_b32 s98, s98, 0xffffffff +s_xor_b32 s99, s99, 0xffffffff +s_add_u32 s98, s98, 0x1 +s_addc_u32 s99, s99, 0 label_MultiplyDone_DLSAQLEVYLOBCPNL: -s_sub_u32 s[sgpr104], s[sgpr104], s[sgprWrapUA] // S - WrapU -s_subb_u32 s[sgpr105], s[sgpr105], s[sgprWrapUA+1] // S - WrapU -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_sub_u32 s98, s98, s[sgprWrapUA] // S - WrapU +s_subb_u32 s99, s99, s[sgprWrapUA+1] // S - WrapU +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 -s_sub_i32 s[sgpr104], 3, s[sgprStaggerUIter] -s_cmp_ge_i32 s[sgpr104], 0 +s_sub_i32 s98, 3, s[sgprStaggerUIter] +s_cmp_ge_i32 s98, 0 s_cbranch_scc0 label_Negative_LQI6BOBE0EY8XIP1 -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes s_branch label_MultiplyDone_9N1QELR2XL4Z0HRB label_Negative_LQI6BOBE0EY8XIP1: -s_abs_i32 s[sgpr104], s[sgpr104] -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_xor_b32 s[sgpr104], s[sgpr104], 0xffffffff -s_xor_b32 s[sgpr105], s[sgpr105], 0xffffffff -s_add_u32 s[sgpr104], s[sgpr104], 0x1 -s_addc_u32 s[sgpr105], s[sgpr105], 0 +s_abs_i32 s98, s98 +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_xor_b32 s98, s98, 0xffffffff +s_xor_b32 s99, s99, 0xffffffff +s_add_u32 s98, s98, 0x1 +s_addc_u32 s99, s99, 0 label_MultiplyDone_9N1QELR2XL4Z0HRB: -s_sub_u32 s[sgpr104], s[sgpr104], s[sgprWrapUB] // S - WrapU -s_subb_u32 s[sgpr105], s[sgpr105], s[sgprWrapUB+1] // S - WrapU -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_sub_u32 s98, s98, s[sgprWrapUB] // S - WrapU +s_subb_u32 s99, s99, s[sgprWrapUB+1] // S - WrapU +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 - // Check if K multiple of 4 -s_and_b32 s[sgpr104], s[sgprSizesSum], 3 -s_cmp_eq_u32 s[sgpr104], 0 +s_and_b32 s98, s[sgprSizesSum], 3 +s_cmp_eq_u32 s98, 0 s_cbranch_scc0 label_tailloop_non_dtl label_tailloop_dtl: @@ -3376,550 +3323,550 @@ ds_read_b128 v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprLocalReadAddrB] o ds_read_b128 v[vgprValuB_X0_I0+60:vgprValuB_X0_I0+60+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=1 oIdx=0 buffer=0 iui=0 /* local read inc a */ -s_mov_b32 s[sgpr104], 0x80 // inc -v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s[sgpr104], v[vgprLocalReadAddrA+0] // lrA += 128 (bpeDS) +s_mov_b32 s97, 0x80 // inc +v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s97, v[vgprLocalReadAddrA+0] // lrA += 128 (bpeDS) /* local read inc b */ // inc (dup assign opt.) -v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s[sgpr104], v[vgprLocalReadAddrB+0] // lrB += 128 (bpeDS) +v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s97, v[vgprLocalReadAddrB+0] // lrB += 128 (bpeDS) s_waitcnt lgkmcnt(0) // 4wait for local read v_and_b32 v135, 63, v[vgprSerial] // v135 = v[vgprSerial] % 64 v_lshrrev_b32 v135, 4, v135 // 135 = 135 / 16 v_lshlrev_b32 v135, 4, v135 // v135 = v135 * 16 v_add_u32 v136, v135, 0 -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+0], v[vgprValuA_X0_I0+32+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+0], v[vgprValuA_X0_I0+40+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+0], v[vgprValuA_X0_I0+48+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+0], v[vgprValuA_X0_I0+56+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+32+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+40+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+48+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+56+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+0], v[vgprValuA_X0_I0+32+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+0], v[vgprValuA_X0_I0+40+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+0], v[vgprValuA_X0_I0+48+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+0], v[vgprValuA_X0_I0+56+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+32+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+40+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+48+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+56+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+2], v[vgprValuA_X0_I0+32+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+2], v[vgprValuA_X0_I0+40+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+2], v[vgprValuA_X0_I0+48+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+2], v[vgprValuA_X0_I0+56+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+3], v[vgprValuA_X0_I0+32+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+3], v[vgprValuA_X0_I0+40+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+3], v[vgprValuA_X0_I0+48+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+3], v[vgprValuA_X0_I0+56+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+2], v[vgprValuA_X0_I0+32+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+2], v[vgprValuA_X0_I0+40+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+2], v[vgprValuA_X0_I0+48+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+2], v[vgprValuA_X0_I0+56+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+3], v[vgprValuA_X0_I0+32+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+3], v[vgprValuA_X0_I0+40+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+3], v[vgprValuA_X0_I0+48+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+3], v[vgprValuA_X0_I0+56+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+4], v[vgprValuA_X0_I0+0+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+4], v[vgprValuA_X0_I0+8+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+4], v[vgprValuA_X0_I0+16+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+4], v[vgprValuA_X0_I0+24+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+4], v[vgprValuA_X0_I0+32+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+4], v[vgprValuA_X0_I0+40+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+4], v[vgprValuA_X0_I0+48+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+4], v[vgprValuA_X0_I0+56+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+5], v[vgprValuA_X0_I0+0+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+5], v[vgprValuA_X0_I0+8+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+5], v[vgprValuA_X0_I0+16+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+5], v[vgprValuA_X0_I0+24+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+5], v[vgprValuA_X0_I0+32+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+5], v[vgprValuA_X0_I0+40+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+5], v[vgprValuA_X0_I0+48+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+5], v[vgprValuA_X0_I0+56+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+4], v[vgprValuA_X0_I0+0+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+4], v[vgprValuA_X0_I0+8+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+4], v[vgprValuA_X0_I0+16+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+4], v[vgprValuA_X0_I0+24+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+4], v[vgprValuA_X0_I0+32+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+4], v[vgprValuA_X0_I0+40+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+4], v[vgprValuA_X0_I0+48+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+4], v[vgprValuA_X0_I0+56+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+5], v[vgprValuA_X0_I0+0+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+5], v[vgprValuA_X0_I0+8+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+5], v[vgprValuA_X0_I0+16+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+5], v[vgprValuA_X0_I0+24+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+5], v[vgprValuA_X0_I0+32+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+5], v[vgprValuA_X0_I0+40+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+5], v[vgprValuA_X0_I0+48+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+5], v[vgprValuA_X0_I0+56+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+6], v[vgprValuA_X0_I0+0+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+6], v[vgprValuA_X0_I0+8+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+6], v[vgprValuA_X0_I0+16+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+6], v[vgprValuA_X0_I0+24+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+6], v[vgprValuA_X0_I0+32+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+6], v[vgprValuA_X0_I0+40+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+6], v[vgprValuA_X0_I0+48+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+6], v[vgprValuA_X0_I0+56+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+6], v[vgprValuA_X0_I0+0+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+6], v[vgprValuA_X0_I0+8+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+6], v[vgprValuA_X0_I0+16+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+6], v[vgprValuA_X0_I0+24+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+6], v[vgprValuA_X0_I0+32+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+6], v[vgprValuA_X0_I0+40+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+6], v[vgprValuA_X0_I0+48+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+6], v[vgprValuA_X0_I0+56+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL v_and_b32 v135, 63, v[vgprSerial] // v135 = v[vgprSerial] % 64 v_lshrrev_b32 v135, 4, v135 // 135 = 135 / 16 v_lshlrev_b32 v135, 4, v135 // v135 = v135 * 16 v_add_u32 v136, v135, 0 -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+0], v[vgprValuB_X0_I0+32+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+0], v[vgprValuB_X0_I0+40+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+0], v[vgprValuB_X0_I0+48+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+0], v[vgprValuB_X0_I0+56+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+1], v[vgprValuB_X0_I0+32+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+1], v[vgprValuB_X0_I0+40+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+1], v[vgprValuB_X0_I0+48+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+1], v[vgprValuB_X0_I0+56+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+0], v[vgprValuB_X0_I0+32+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+0], v[vgprValuB_X0_I0+40+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+0], v[vgprValuB_X0_I0+48+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+0], v[vgprValuB_X0_I0+56+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+1], v[vgprValuB_X0_I0+32+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+1], v[vgprValuB_X0_I0+40+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+1], v[vgprValuB_X0_I0+48+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+1], v[vgprValuB_X0_I0+56+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+2], v[vgprValuB_X0_I0+32+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+2], v[vgprValuB_X0_I0+40+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+2], v[vgprValuB_X0_I0+48+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+2], v[vgprValuB_X0_I0+56+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+3], v[vgprValuB_X0_I0+32+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+3], v[vgprValuB_X0_I0+40+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+3], v[vgprValuB_X0_I0+48+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+3], v[vgprValuB_X0_I0+56+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+2], v[vgprValuB_X0_I0+32+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+2], v[vgprValuB_X0_I0+40+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+2], v[vgprValuB_X0_I0+48+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+2], v[vgprValuB_X0_I0+56+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+3], v[vgprValuB_X0_I0+32+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+3], v[vgprValuB_X0_I0+40+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+3], v[vgprValuB_X0_I0+48+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+3], v[vgprValuB_X0_I0+56+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+4], v[vgprValuB_X0_I0+0+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+4], v[vgprValuB_X0_I0+8+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+4], v[vgprValuB_X0_I0+16+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+4], v[vgprValuB_X0_I0+24+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+4], v[vgprValuB_X0_I0+32+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+4], v[vgprValuB_X0_I0+40+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+4], v[vgprValuB_X0_I0+48+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+4], v[vgprValuB_X0_I0+56+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+5], v[vgprValuB_X0_I0+0+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+5], v[vgprValuB_X0_I0+8+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+5], v[vgprValuB_X0_I0+16+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+5], v[vgprValuB_X0_I0+24+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+5], v[vgprValuB_X0_I0+32+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+5], v[vgprValuB_X0_I0+40+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+5], v[vgprValuB_X0_I0+48+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+5], v[vgprValuB_X0_I0+56+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+4], v[vgprValuB_X0_I0+0+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+4], v[vgprValuB_X0_I0+8+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+4], v[vgprValuB_X0_I0+16+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+4], v[vgprValuB_X0_I0+24+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+4], v[vgprValuB_X0_I0+32+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+4], v[vgprValuB_X0_I0+40+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+4], v[vgprValuB_X0_I0+48+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+4], v[vgprValuB_X0_I0+56+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+5], v[vgprValuB_X0_I0+0+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+5], v[vgprValuB_X0_I0+8+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+5], v[vgprValuB_X0_I0+16+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+5], v[vgprValuB_X0_I0+24+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+5], v[vgprValuB_X0_I0+32+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+5], v[vgprValuB_X0_I0+40+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+5], v[vgprValuB_X0_I0+48+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+5], v[vgprValuB_X0_I0+56+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+6], v[vgprValuB_X0_I0+0+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+6], v[vgprValuB_X0_I0+8+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+6], v[vgprValuB_X0_I0+16+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+6], v[vgprValuB_X0_I0+24+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+6], v[vgprValuB_X0_I0+32+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+6], v[vgprValuB_X0_I0+40+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+6], v[vgprValuB_X0_I0+48+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+6], v[vgprValuB_X0_I0+56+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+7], v[vgprValuB_X0_I0+0+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+7], v[vgprValuB_X0_I0+8+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+7], v[vgprValuB_X0_I0+16+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+7], v[vgprValuB_X0_I0+24+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+7], v[vgprValuB_X0_I0+32+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+7], v[vgprValuB_X0_I0+40+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+7], v[vgprValuB_X0_I0+48+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+7], v[vgprValuB_X0_I0+56+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -s_and_b32 s[sgpr106], s[sgprLoopCounterL], 31 // get inputs for edge thread -s_sub_u32 s[sgpr106], 32, s[sgpr106] // use shift to fill 0 for outside element -s_lshl_b32 s[sgpr106], s[sgpr106], 3 // use shift to fill 0 for outside element -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+0+0+0+0:vgprValuA_X0_I0+0+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+0+0+0+2:vgprValuA_X0_I0+0+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+0+0+0+4:vgprValuA_X0_I0+0+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+0+0+0+6:vgprValuA_X0_I0+0+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+6], v[vgprValuB_X0_I0+0+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+6], v[vgprValuB_X0_I0+8+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+6], v[vgprValuB_X0_I0+16+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+6], v[vgprValuB_X0_I0+24+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+6], v[vgprValuB_X0_I0+32+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+6], v[vgprValuB_X0_I0+40+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+6], v[vgprValuB_X0_I0+48+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+6], v[vgprValuB_X0_I0+56+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+7], v[vgprValuB_X0_I0+0+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+7], v[vgprValuB_X0_I0+8+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+7], v[vgprValuB_X0_I0+16+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+7], v[vgprValuB_X0_I0+24+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+7], v[vgprValuB_X0_I0+32+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+7], v[vgprValuB_X0_I0+40+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+7], v[vgprValuB_X0_I0+48+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+7], v[vgprValuB_X0_I0+56+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +s_and_b32 s97, s[sgprLoopCounterL], 31 // get inputs for edge thread +s_sub_u32 s97, 32, s97 // use shift to fill 0 for outside element +s_lshl_b32 s97, s97, 3 // use shift to fill 0 for outside element +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+0+0+0+0:vgprValuA_X0_I0+0+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+0+0+0+2:vgprValuA_X0_I0+0+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+0+0+0+4:vgprValuA_X0_I0+0+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+0+0+0+6:vgprValuA_X0_I0+0+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+4], v[vgprValuA_X0_I0+0+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+5], v[vgprValuA_X0_I0+0+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+4], v[vgprValuA_X0_I0+0+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+5], v[vgprValuA_X0_I0+0+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+6], v[vgprValuA_X0_I0+0+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+8+0+0+0:vgprValuA_X0_I0+8+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+8+0+0+2:vgprValuA_X0_I0+8+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+8+0+0+4:vgprValuA_X0_I0+8+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+8+0+0+6:vgprValuA_X0_I0+8+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+6], v[vgprValuA_X0_I0+0+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+8+0+0+0:vgprValuA_X0_I0+8+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+8+0+0+2:vgprValuA_X0_I0+8+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+8+0+0+4:vgprValuA_X0_I0+8+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+8+0+0+6:vgprValuA_X0_I0+8+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+4], v[vgprValuA_X0_I0+8+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+5], v[vgprValuA_X0_I0+8+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+4], v[vgprValuA_X0_I0+8+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+5], v[vgprValuA_X0_I0+8+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+6], v[vgprValuA_X0_I0+8+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+16+0+0+0:vgprValuA_X0_I0+16+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+16+0+0+2:vgprValuA_X0_I0+16+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+16+0+0+4:vgprValuA_X0_I0+16+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+16+0+0+6:vgprValuA_X0_I0+16+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+6], v[vgprValuA_X0_I0+8+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+16+0+0+0:vgprValuA_X0_I0+16+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+16+0+0+2:vgprValuA_X0_I0+16+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+16+0+0+4:vgprValuA_X0_I0+16+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+16+0+0+6:vgprValuA_X0_I0+16+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+4], v[vgprValuA_X0_I0+16+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+5], v[vgprValuA_X0_I0+16+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+4], v[vgprValuA_X0_I0+16+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+5], v[vgprValuA_X0_I0+16+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+6], v[vgprValuA_X0_I0+16+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+24+0+0+0:vgprValuA_X0_I0+24+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+24+0+0+2:vgprValuA_X0_I0+24+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+24+0+0+4:vgprValuA_X0_I0+24+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+24+0+0+6:vgprValuA_X0_I0+24+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+6], v[vgprValuA_X0_I0+16+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+24+0+0+0:vgprValuA_X0_I0+24+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+24+0+0+2:vgprValuA_X0_I0+24+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+24+0+0+4:vgprValuA_X0_I0+24+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+24+0+0+6:vgprValuA_X0_I0+24+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+4], v[vgprValuA_X0_I0+24+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+5], v[vgprValuA_X0_I0+24+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+4], v[vgprValuA_X0_I0+24+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+5], v[vgprValuA_X0_I0+24+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+6], v[vgprValuA_X0_I0+24+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+32+0+0+0:vgprValuA_X0_I0+32+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+32+0+0+2:vgprValuA_X0_I0+32+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+32+0+0+4:vgprValuA_X0_I0+32+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+32+0+0+6:vgprValuA_X0_I0+32+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+6], v[vgprValuA_X0_I0+24+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+32+0+0+0:vgprValuA_X0_I0+32+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+32+0+0+2:vgprValuA_X0_I0+32+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+32+0+0+4:vgprValuA_X0_I0+32+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+32+0+0+6:vgprValuA_X0_I0+32+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+0], v[vgprValuA_X0_I0+32+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+32+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+0], v[vgprValuA_X0_I0+32+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+32+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+2], v[vgprValuA_X0_I0+32+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+3], v[vgprValuA_X0_I0+32+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+2], v[vgprValuA_X0_I0+32+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+3], v[vgprValuA_X0_I0+32+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+4], v[vgprValuA_X0_I0+32+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+5], v[vgprValuA_X0_I0+32+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+4], v[vgprValuA_X0_I0+32+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+5], v[vgprValuA_X0_I0+32+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+6], v[vgprValuA_X0_I0+32+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+40+0+0+0:vgprValuA_X0_I0+40+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+40+0+0+2:vgprValuA_X0_I0+40+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+40+0+0+4:vgprValuA_X0_I0+40+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+40+0+0+6:vgprValuA_X0_I0+40+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+6], v[vgprValuA_X0_I0+32+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+40+0+0+0:vgprValuA_X0_I0+40+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+40+0+0+2:vgprValuA_X0_I0+40+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+40+0+0+4:vgprValuA_X0_I0+40+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+40+0+0+6:vgprValuA_X0_I0+40+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+0], v[vgprValuA_X0_I0+40+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+40+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+0], v[vgprValuA_X0_I0+40+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+40+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+2], v[vgprValuA_X0_I0+40+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+3], v[vgprValuA_X0_I0+40+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+2], v[vgprValuA_X0_I0+40+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+3], v[vgprValuA_X0_I0+40+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+4], v[vgprValuA_X0_I0+40+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+5], v[vgprValuA_X0_I0+40+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+4], v[vgprValuA_X0_I0+40+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+5], v[vgprValuA_X0_I0+40+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+6], v[vgprValuA_X0_I0+40+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+48+0+0+0:vgprValuA_X0_I0+48+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+48+0+0+2:vgprValuA_X0_I0+48+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+48+0+0+4:vgprValuA_X0_I0+48+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+48+0+0+6:vgprValuA_X0_I0+48+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+6], v[vgprValuA_X0_I0+40+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+48+0+0+0:vgprValuA_X0_I0+48+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+48+0+0+2:vgprValuA_X0_I0+48+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+48+0+0+4:vgprValuA_X0_I0+48+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+48+0+0+6:vgprValuA_X0_I0+48+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+0], v[vgprValuA_X0_I0+48+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+48+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+0], v[vgprValuA_X0_I0+48+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+48+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+2], v[vgprValuA_X0_I0+48+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+3], v[vgprValuA_X0_I0+48+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+2], v[vgprValuA_X0_I0+48+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+3], v[vgprValuA_X0_I0+48+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+4], v[vgprValuA_X0_I0+48+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+5], v[vgprValuA_X0_I0+48+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+4], v[vgprValuA_X0_I0+48+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+5], v[vgprValuA_X0_I0+48+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+6], v[vgprValuA_X0_I0+48+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+56+0+0+0:vgprValuA_X0_I0+56+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+56+0+0+2:vgprValuA_X0_I0+56+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+56+0+0+4:vgprValuA_X0_I0+56+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+56+0+0+6:vgprValuA_X0_I0+56+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+6], v[vgprValuA_X0_I0+48+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+56+0+0+0:vgprValuA_X0_I0+56+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+56+0+0+2:vgprValuA_X0_I0+56+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+56+0+0+4:vgprValuA_X0_I0+56+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+56+0+0+6:vgprValuA_X0_I0+56+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+0], v[vgprValuA_X0_I0+56+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+56+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+0], v[vgprValuA_X0_I0+56+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+56+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+2], v[vgprValuA_X0_I0+56+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+3], v[vgprValuA_X0_I0+56+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+2], v[vgprValuA_X0_I0+56+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+3], v[vgprValuA_X0_I0+56+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+4], v[vgprValuA_X0_I0+56+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+5], v[vgprValuA_X0_I0+56+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+4], v[vgprValuA_X0_I0+56+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+5], v[vgprValuA_X0_I0+56+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+6], v[vgprValuA_X0_I0+56+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+0+0+0+0:vgprValuB_X0_I0+0+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+0+0+0+2:vgprValuB_X0_I0+0+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+0+0+0+4:vgprValuB_X0_I0+0+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+0+0+0+6:vgprValuB_X0_I0+0+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+6], v[vgprValuA_X0_I0+56+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+0+0+0+0:vgprValuB_X0_I0+0+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+0+0+0+2:vgprValuB_X0_I0+0+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+0+0+0+4:vgprValuB_X0_I0+0+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+0+0+0+6:vgprValuB_X0_I0+0+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+4], v[vgprValuB_X0_I0+0+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+5], v[vgprValuB_X0_I0+0+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+4], v[vgprValuB_X0_I0+0+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+5], v[vgprValuB_X0_I0+0+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+6], v[vgprValuB_X0_I0+0+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+7], v[vgprValuB_X0_I0+0+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+8+0+0+0:vgprValuB_X0_I0+8+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+8+0+0+2:vgprValuB_X0_I0+8+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+8+0+0+4:vgprValuB_X0_I0+8+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+8+0+0+6:vgprValuB_X0_I0+8+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+6], v[vgprValuB_X0_I0+0+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+7], v[vgprValuB_X0_I0+0+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+8+0+0+0:vgprValuB_X0_I0+8+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+8+0+0+2:vgprValuB_X0_I0+8+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+8+0+0+4:vgprValuB_X0_I0+8+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+8+0+0+6:vgprValuB_X0_I0+8+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+4], v[vgprValuB_X0_I0+8+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+5], v[vgprValuB_X0_I0+8+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+4], v[vgprValuB_X0_I0+8+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+5], v[vgprValuB_X0_I0+8+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+6], v[vgprValuB_X0_I0+8+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+7], v[vgprValuB_X0_I0+8+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+16+0+0+0:vgprValuB_X0_I0+16+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+16+0+0+2:vgprValuB_X0_I0+16+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+16+0+0+4:vgprValuB_X0_I0+16+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+16+0+0+6:vgprValuB_X0_I0+16+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+6], v[vgprValuB_X0_I0+8+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+7], v[vgprValuB_X0_I0+8+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+16+0+0+0:vgprValuB_X0_I0+16+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+16+0+0+2:vgprValuB_X0_I0+16+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+16+0+0+4:vgprValuB_X0_I0+16+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+16+0+0+6:vgprValuB_X0_I0+16+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+4], v[vgprValuB_X0_I0+16+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+5], v[vgprValuB_X0_I0+16+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+4], v[vgprValuB_X0_I0+16+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+5], v[vgprValuB_X0_I0+16+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+6], v[vgprValuB_X0_I0+16+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+7], v[vgprValuB_X0_I0+16+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+24+0+0+0:vgprValuB_X0_I0+24+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+24+0+0+2:vgprValuB_X0_I0+24+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+24+0+0+4:vgprValuB_X0_I0+24+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+24+0+0+6:vgprValuB_X0_I0+24+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+6], v[vgprValuB_X0_I0+16+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+7], v[vgprValuB_X0_I0+16+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+24+0+0+0:vgprValuB_X0_I0+24+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+24+0+0+2:vgprValuB_X0_I0+24+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+24+0+0+4:vgprValuB_X0_I0+24+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+24+0+0+6:vgprValuB_X0_I0+24+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+4], v[vgprValuB_X0_I0+24+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+5], v[vgprValuB_X0_I0+24+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+4], v[vgprValuB_X0_I0+24+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+5], v[vgprValuB_X0_I0+24+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+6], v[vgprValuB_X0_I0+24+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+7], v[vgprValuB_X0_I0+24+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+32+0+0+0:vgprValuB_X0_I0+32+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+32+0+0+2:vgprValuB_X0_I0+32+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+32+0+0+4:vgprValuB_X0_I0+32+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+32+0+0+6:vgprValuB_X0_I0+32+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+6], v[vgprValuB_X0_I0+24+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+7], v[vgprValuB_X0_I0+24+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+32+0+0+0:vgprValuB_X0_I0+32+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+32+0+0+2:vgprValuB_X0_I0+32+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+32+0+0+4:vgprValuB_X0_I0+32+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+32+0+0+6:vgprValuB_X0_I0+32+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+0], v[vgprValuB_X0_I0+32+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+1], v[vgprValuB_X0_I0+32+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+0], v[vgprValuB_X0_I0+32+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+1], v[vgprValuB_X0_I0+32+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+2], v[vgprValuB_X0_I0+32+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+3], v[vgprValuB_X0_I0+32+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+2], v[vgprValuB_X0_I0+32+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+3], v[vgprValuB_X0_I0+32+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+4], v[vgprValuB_X0_I0+32+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+5], v[vgprValuB_X0_I0+32+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+4], v[vgprValuB_X0_I0+32+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+5], v[vgprValuB_X0_I0+32+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+6], v[vgprValuB_X0_I0+32+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+7], v[vgprValuB_X0_I0+32+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+40+0+0+0:vgprValuB_X0_I0+40+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+40+0+0+2:vgprValuB_X0_I0+40+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+40+0+0+4:vgprValuB_X0_I0+40+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+40+0+0+6:vgprValuB_X0_I0+40+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+6], v[vgprValuB_X0_I0+32+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+7], v[vgprValuB_X0_I0+32+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+40+0+0+0:vgprValuB_X0_I0+40+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+40+0+0+2:vgprValuB_X0_I0+40+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+40+0+0+4:vgprValuB_X0_I0+40+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+40+0+0+6:vgprValuB_X0_I0+40+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+0], v[vgprValuB_X0_I0+40+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+1], v[vgprValuB_X0_I0+40+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+0], v[vgprValuB_X0_I0+40+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+1], v[vgprValuB_X0_I0+40+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+2], v[vgprValuB_X0_I0+40+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+3], v[vgprValuB_X0_I0+40+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+2], v[vgprValuB_X0_I0+40+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+3], v[vgprValuB_X0_I0+40+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+4], v[vgprValuB_X0_I0+40+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+5], v[vgprValuB_X0_I0+40+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+4], v[vgprValuB_X0_I0+40+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+5], v[vgprValuB_X0_I0+40+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+6], v[vgprValuB_X0_I0+40+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+7], v[vgprValuB_X0_I0+40+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+48+0+0+0:vgprValuB_X0_I0+48+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+48+0+0+2:vgprValuB_X0_I0+48+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+48+0+0+4:vgprValuB_X0_I0+48+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+48+0+0+6:vgprValuB_X0_I0+48+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+6], v[vgprValuB_X0_I0+40+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+7], v[vgprValuB_X0_I0+40+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+48+0+0+0:vgprValuB_X0_I0+48+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+48+0+0+2:vgprValuB_X0_I0+48+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+48+0+0+4:vgprValuB_X0_I0+48+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+48+0+0+6:vgprValuB_X0_I0+48+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+0], v[vgprValuB_X0_I0+48+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+1], v[vgprValuB_X0_I0+48+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+0], v[vgprValuB_X0_I0+48+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+1], v[vgprValuB_X0_I0+48+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+2], v[vgprValuB_X0_I0+48+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+3], v[vgprValuB_X0_I0+48+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+2], v[vgprValuB_X0_I0+48+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+3], v[vgprValuB_X0_I0+48+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+4], v[vgprValuB_X0_I0+48+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+5], v[vgprValuB_X0_I0+48+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+4], v[vgprValuB_X0_I0+48+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+5], v[vgprValuB_X0_I0+48+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+6], v[vgprValuB_X0_I0+48+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+7], v[vgprValuB_X0_I0+48+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+56+0+0+0:vgprValuB_X0_I0+56+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+56+0+0+2:vgprValuB_X0_I0+56+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+56+0+0+4:vgprValuB_X0_I0+56+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+56+0+0+6:vgprValuB_X0_I0+56+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+6], v[vgprValuB_X0_I0+48+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+7], v[vgprValuB_X0_I0+48+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+56+0+0+0:vgprValuB_X0_I0+56+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+56+0+0+2:vgprValuB_X0_I0+56+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+56+0+0+4:vgprValuB_X0_I0+56+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+56+0+0+6:vgprValuB_X0_I0+56+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+0], v[vgprValuB_X0_I0+56+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+1], v[vgprValuB_X0_I0+56+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+0], v[vgprValuB_X0_I0+56+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+1], v[vgprValuB_X0_I0+56+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+2], v[vgprValuB_X0_I0+56+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+3], v[vgprValuB_X0_I0+56+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+2], v[vgprValuB_X0_I0+56+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+3], v[vgprValuB_X0_I0+56+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+4], v[vgprValuB_X0_I0+56+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+5], v[vgprValuB_X0_I0+56+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+4], v[vgprValuB_X0_I0+56+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+5], v[vgprValuB_X0_I0+56+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+6], v[vgprValuB_X0_I0+56+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+7], v[vgprValuB_X0_I0+56+0+0+7], v145, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+6], v[vgprValuB_X0_I0+56+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+7], v[vgprValuB_X0_I0+56+0+0+7], v145, s[98:99] s_nop 1 v_mfma_f32_16x16x128_f8f6f4 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[0:3] cbsz:1 blgp:0 // left value = acc[0+0:3+0] v_mfma_f32_16x16x128_f8f6f4 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[4:7] cbsz:1 blgp:0 // left value = acc[4+0:7+0] @@ -3992,12 +3939,12 @@ s_add_u32 s[sgprOrigLoopCounter], s[sgprOrigLoopCounter], 0x80 // inc counterL s_cmp_le_i32 s[sgprLoopCounterL], 0x0 // counterL<=0 s_cbranch_scc0 label_TailLoopBeginL // restart LoopL label_TailLoopEndL: -s_mov_b32 s[sgpr104], 1 // tailloop lds offset -s_mul_i32 s[sgpr104], s[sgprOrigLoopCounter], s[sgpr104] // scale by mul -v_sub_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgpr104] // remove lro damage -s_mov_b32 s[sgpr104], 1 // tailloop lds offset -s_mul_i32 s[sgpr104], s[sgprOrigLoopCounter], s[sgpr104] // scale by mul -v_sub_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgpr104] // remove lro damage +s_mov_b32 s97, 1 // tailloop lds offset +s_mul_i32 s97, s[sgprOrigLoopCounter], s97 // scale by mul +v_sub_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s97 // remove lro damage +s_mov_b32 s97, 1 // tailloop lds offset +s_mul_i32 s97, s[sgprOrigLoopCounter], s97 // scale by mul +v_sub_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s97 // remove lro damage label_SkipTailLoopL: .set vgprValuA_X0_I0_BASE, UNDEF .set vgprValuA_X0_I0, UNDEF @@ -4008,11 +3955,11 @@ label_SkipTailLoopL: label_Summation_End_DZOUDPYJU2HHRCOQ: .set sgprLoopCounterL, UNDEF .set sgprOrigLoopCounter, UNDEF -.set sgprStaggerUIter, UNDEF .set sgprSrdA, UNDEF .set sgprSrdB, UNDEF .set sgprShadowLimitA, UNDEF .set sgprShadowLimitB, UNDEF +.set sgprStaggerUIter, UNDEF .set sgprWrapUA, UNDEF .set sgprWrapUB, UNDEF .set sgprGlobalReadIncsA, UNDEF @@ -4020,58 +3967,31 @@ label_Summation_End_DZOUDPYJU2HHRCOQ: .set sgprScalarGlobalReadOffsetA, UNDEF .set sgprScalarGlobalReadOffsetB, UNDEF /* load store sgprs */ -.set sgprAddressScaleA, 72 -.set sgprAddressScaleB, 74 -.set sgprAddressScaleAlphaVec, 76 -.set sgprAddressBias, 78 -.set sgprBiasType, 80 -.set sgprBiasStride, 81 -.set sgpractivationAlpha, 82 -.set sgpractivationBeta, 83 -.set sgprActivationType, 84 - -v_readlane_b32 s[sgprSKItersPerWG], v255, 0 -s_nop 0 -v_readlane_b32 s[sgprskGrid], v255, 1 -s_nop 0 -v_readlane_b32 s[sgprMagicNumberProblemNumGroupTiles0], v255, 2 -s_nop 0 -v_readlane_b32 s[sgprMagicShiftProblemNumGroupTiles0], v255, 3 -s_nop 0 -v_readlane_b32 s[sgprMagicShiftItersPerTile], v255, 4 -s_nop 0 -v_readlane_b32 s[sgprMagicNumProblemNumGroupTiles0By1], v255, 5 -s_nop 0 -v_readlane_b32 s[sgprWGM], v255, 6 -s_nop 0 -v_readlane_b32 s[sgprKernArgAddress], v255, 7 -s_nop 0 -v_readlane_b32 s[sgprKernArgAddress+1], v255, 8 - -.set sgpr104, UNDEF -.set sgpr105, UNDEF -.set sgpr106, UNDEF -.set sgpr107, UNDEF -.set sgpr108, UNDEF -.set sgpr109, UNDEF -.set sgpr110, UNDEF - +.set sgprAddressScaleA, 64 +.set sgprAddressScaleB, 66 +.set sgprAddressScaleAlphaVec, 68 +.set sgprAddressBias, 70 +.set sgprBiasType, 72 +.set sgprBiasStride, 73 +.set sgpractivationAlpha, 74 +.set sgpractivationBeta, 75 +.set sgprActivationType, 76 /* Check if custom structure pointer is null */ s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ? s_cbranch_scc1 label_LoadExternalEpilogueStruct // branch if ArgType == 2 -s_load_dwordx8 s[72:79], s[sgprKernArgAddress:sgprKernArgAddress+1], 152 // 152 -s_load_dwordx4 s[80:83], s[sgprKernArgAddress:sgprKernArgAddress+1], 184 // 184 -s_load_dword s84, s[sgprKernArgAddress:sgprKernArgAddress+1], 200 // 200 +s_load_dwordx8 s[64:71], s[sgprKernArgAddress:sgprKernArgAddress+1], 124 // 124 +s_load_dwordx4 s[72:75], s[sgprKernArgAddress:sgprKernArgAddress+1], 156 // 156 +s_load_dword s76, s[sgprKernArgAddress:sgprKernArgAddress+1], 172 // 172 s_branch label_LoadExternalEpilogueStructEnd label_LoadExternalEpilogueStruct: -s_load_dwordx4 s[72:75], s[sgprKernArgAddress:sgprKernArgAddress+1], 176 // 176 -s_load_dwordx4 s[76:79], s[sgprKernArgAddress:sgprKernArgAddress+1], 208 // 208 -s_load_dwordx2 s[80:81], s[sgprKernArgAddress:sgprKernArgAddress+1], 224 // 224 -s_load_dwordx2 s[82:83], s[sgprKernArgAddress:sgprKernArgAddress+1], 248 // 248 -s_load_dword s84, s[sgprKernArgAddress:sgprKernArgAddress+1], 256 // 256 +s_load_dwordx4 s[64:67], s[sgprKernArgAddress:sgprKernArgAddress+1], 148 // 148 +s_load_dwordx4 s[68:71], s[sgprKernArgAddress:sgprKernArgAddress+1], 180 // 180 +s_load_dwordx2 s[72:73], s[sgprKernArgAddress:sgprKernArgAddress+1], 196 // 196 +s_load_dwordx2 s[74:75], s[sgprKernArgAddress:sgprKernArgAddress+1], 220 // 220 +s_load_dword s76, s[sgprKernArgAddress:sgprKernArgAddress+1], 228 // 228 label_LoadExternalEpilogueStructEnd: -.set sgprSrdScaleAlphaVec, 88 -.set sgprSrdBias, 92 +.set sgprSrdScaleAlphaVec, 80 +.set sgprSrdBias, 84 /* Mapping of Acc register -> C Vgpr register */ @@ -4122,10 +4042,10 @@ s_mov_b32 s[sgprSrdScaleAlphaVec+2], s[sgprSizeI] label_ScaleAlphaVecAddrValid_End: s_mul_i32 s[sgprSrdScaleAlphaVec+2], 0x4, s[sgprSrdScaleAlphaVec+2] // ScaleAlphaVec scaled by BPE -s_add_u32 s67, s[sgprWorkGroup2], 0x1 -s_mul_i32 s67, s[sgprBiasStride], s67 // stride * (wg+1) -s_cmp_eq_u32 s67, 0 // bias stride = 0? -s_cselect_b32 s67, s[sgprSizeI], s67 +s_add_u32 s77, s[sgprWorkGroup2], 0x1 +s_mul_i32 s77, s[sgprBiasStride], s77 // stride * (wg+1) +s_cmp_eq_u32 s77, 0 // bias stride = 0? +s_cselect_b32 s77, s[sgprSizeI], s77 s_mov_b64 s[sgprSrdBias+0:sgprSrdBias+0+1], s[sgprAddressBias+0:sgprAddressBias+0+1] // init SRD base address s_mov_b32 s[sgprSrdBias+3], Srd127_96 // Set bits 127_96 in post-loop SRD s_cmp_eq_u64 s[sgprAddressBias:sgprAddressBias+1], 0 // s[AddressBias] == 0 ? @@ -4133,7 +4053,7 @@ s_cbranch_scc0 label_BiasAddrValid // branch if s[AddressBias] ! s_mov_b32 s[sgprSrdBias+2], 0 s_branch label_BiasAddrValid_End label_BiasAddrValid: -s_mov_b32 s[sgprSrdBias+2], s67 +s_mov_b32 s[sgprSrdBias+2], s77 label_BiasAddrValid_End: label_Load_Biasf32_0: @@ -4143,15 +4063,15 @@ s_cbranch_scc1 label_Load_Biasbf16_0 // Branch if true /******************************************/ /* Read vector to LDS */ /******************************************/ -s_mul_i32 s67, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_add_u32 v8, s67, v[vgprSerial] // coord 0 = wgp0 * MT0 + thread offset +s_mul_i32 s77, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_add_u32 v8, s77, v[vgprSerial] // coord 0 = wgp0 * MT0 + thread offset s_mul_i32 s[sgprSrdBias+2], 0x4, s[sgprSrdBias+2] // scaled by BPE -s_mul_i32 s67, s[sgprBiasStride], s[sgprWorkGroup2] // Stride * WG -v_add_u32 v6, s67, v8 // coord 0 = wgp0 * MT0 + thread offset + Stride * WG +s_mul_i32 s77, s[sgprBiasStride], s[sgprWorkGroup2] // Stride * WG +v_add_u32 v6, s77, v8 // coord 0 = wgp0 * MT0 + thread offset + Stride * WG v_lshlrev_b32 v6, 0x2, v6 // Global bias address scaled by BPE v_lshlrev_b32 v7, 0x2, v8 // Global scaleAlpha address scaled by BPE -s_mul_i32 s67, 256, s[sgprWorkGroup1] // wgp1 * MT1 -v_add_u32 v8, s67, v[vgprSerial] // coord 1 = wgp1 * MT1 + thread offset +s_mul_i32 s77, 256, s[sgprWorkGroup1] // wgp1 * MT1 +v_add_u32 v8, s77, v[vgprSerial] // coord 1 = wgp1 * MT1 + thread offset buffer_load_dword v4, v6, s[sgprSrdBias:sgprSrdBias+3], 0 offen offset:0 // Load Bias buffer_load_dword v5, v7, s[sgprSrdScaleAlphaVec:sgprSrdScaleAlphaVec+3], 0 offen offset:0 // Load ScaleAlphaVec v_lshlrev_b32 v8, 0x2, v[vgprSerial] // Local address scaled by BPE @@ -4170,15 +4090,15 @@ s_cbranch_scc1 label_Load_Bias_End // Branch if true /******************************************/ /* Read vector to LDS */ /******************************************/ -s_mul_i32 s67, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_add_u32 v8, s67, v[vgprSerial] // coord 0 = wgp0 * MT0 + thread offset +s_mul_i32 s77, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_add_u32 v8, s77, v[vgprSerial] // coord 0 = wgp0 * MT0 + thread offset s_mul_i32 s[sgprSrdBias+2], 0x2, s[sgprSrdBias+2] // scaled by BPE -s_mul_i32 s67, s[sgprBiasStride], s[sgprWorkGroup2] // Stride * WG -v_add_u32 v6, s67, v8 // coord 0 = wgp0 * MT0 + thread offset + Stride * WG +s_mul_i32 s77, s[sgprBiasStride], s[sgprWorkGroup2] // Stride * WG +v_add_u32 v6, s77, v8 // coord 0 = wgp0 * MT0 + thread offset + Stride * WG v_lshlrev_b32 v6, 0x1, v6 // Global bias address scaled by BPE v_lshlrev_b32 v7, 0x2, v8 // Global scaleAlpha address scaled by BPE -s_mul_i32 s67, 256, s[sgprWorkGroup1] // wgp1 * MT1 -v_add_u32 v8, s67, v[vgprSerial] // coord 1 = wgp1 * MT1 + thread offset +s_mul_i32 s77, 256, s[sgprWorkGroup1] // wgp1 * MT1 +v_add_u32 v8, s77, v[vgprSerial] // coord 1 = wgp1 * MT1 + thread offset buffer_load_short_d16 v4, v6, s[sgprSrdBias:sgprSrdBias+3], 0 offen offset:0 // Load Bias buffer_load_dword v5, v7, s[sgprSrdScaleAlphaVec:sgprSrdScaleAlphaVec+3], 0 offen offset:0 // Load ScaleAlphaVec v_lshlrev_b32 v8, 0x2, v[vgprSerial] // Local address scaled by BPE @@ -4201,38 +4121,48 @@ s_waitcnt lgkmcnt(0) // wait for scaleAB load v_mul_f32 v4, v4, s8 v_mul_f32 v4, v4, s9 s_nop 0 // 1 wait states -s_mov_b32 s67, s[sgprAlpha] // Save alpha value +s_mov_b32 s64, s[sgprAlpha] // Save alpha value v_readfirstlane_b32 s[sgprAlpha], v4 // Update Alpha s_cmp_eq_u32 s[sgprStreamKLocalStart], 0 // does wg start tile? s_cbranch_scc1 label_NoBranch_QWMA7J3AUDGL0X23 // Only branch on scc0 -s_getpc_b64 s[86:87] // addr of next instr -s_add_i32 s88, label_SK_Partials, 4 // target branch offset -s_add_u32 s86, s86, s88 // add target branch offset -s_addc_u32 s87, s87, 0 // add high and carry -s_setpc_b64 s[86:87] // branch to label_SK_Partials +s_getpc_b64 s[88:89] // addr of next instr +s_add_i32 s90, label_SK_Partials, 4 // target branch offset +s_add_u32 s88, s88, s90 // add target branch offset +s_addc_u32 s89, s89, 0 // add high and carry +s_setpc_b64 s[88:89] // branch to label_SK_Partials label_NoBranch_QWMA7J3AUDGL0X23: s_cmp_eq_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // does wg finish tile? s_cbranch_scc1 label_SK_Store // Branch if started and finished tile, go to regular store code -s_add_u32 s85, s[sgprStreamKIdx], 1 // input partial tile index -s_mul_hi_u32 s75, s[sgprStreamKIterEnd], s[sgprMagicNumberItersPerTile] // s_magic mul, div alg 2 -s_lshr_b32 s76, s[sgprMagicShiftItersPerTile], 31 // tmpS = extract abit -s_mul_i32 s74, s[sgprStreamKIterEnd], s76 // s_magic mul, div alg 2 -s_add_u32 s74, s74, s75 -s_and_b32 s76, s[sgprMagicShiftItersPerTile], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s74, s74, s76 // sMagicDiv Alg 2 -s_mul_i32 s74, s74, s[sgprItersPerTile] // start iteration of partial tile -s_sub_u32 s86, s[sgprStreamKIterEnd], s74 // calc iterations completed by this WG +s_add_u32 s65, s[sgprStreamKIdx], 1 // input partial tile index +v_cvt_f32_u32 v17, s[sgprItersPerTile] // StreamKIterEnd // ItersPerTile +v_rcp_iflag_f32 v17, v17 // StreamKIterEnd // ItersPerTile +v_cvt_f32_u32 v18, s[sgprStreamKIterEnd] // StreamKIterEnd // ItersPerTile +v_mul_f32 v17, v17, v18 // StreamKIterEnd // ItersPerTile +v_cvt_u32_f32 v17, v17 // StreamKIterEnd // ItersPerTile +v_mul_u32_u24 v18, v17, s[sgprItersPerTile] // StreamKIterEnd // ItersPerTile +v_sub_u32 v18, s[sgprStreamKIterEnd], v18 // StreamKIterEnd // ItersPerTile +v_cmpx_eq_u32 exec, v18, s[sgprItersPerTile] // StreamKIterEnd // ItersPerTile +v_add_u32 v17, 1, v17 // StreamKIterEnd // ItersPerTile +v_mov_b32 v18, 0 // StreamKIterEnd // ItersPerTile +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v18, s[sgprItersPerTile] // overflow happened in remainder +v_sub_u32 v17, v17, 1 // quotient - 1 +v_mul_u32_u24 v18, v17, s[sgprItersPerTile] // re-calculate remainder +v_sub_u32 v18, s[sgprStreamKIterEnd], v18 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s77, v17 // quotient +v_readfirstlane_b32 s68, v18 // remainder label_SK_Fixup: -s_lshl_b32 s74, s85, 2 // flag offset based on CTA index -s_load_dword s76, s[sgprAddressFlags:sgprAddressFlags+1], s74 glc // get flag +s_lshl_b32 s77, s65, 2 // flag offset based on CTA index +s_load_dword s79, s[sgprAddressFlags:sgprAddressFlags+1], s77 glc // get flag s_waitcnt lgkmcnt(0) // wait for flag load -s_cmp_eq_u32 s76, 1 // check if ready +s_cmp_eq_u32 s79, 1 // check if ready s_cbranch_scc0 label_SK_Fixup // if flag not set, wait and check again s_barrier // wait for all workgroups before resetting flag -v_readfirstlane_b32 s76, v[vgprSerial] // Wave 0 updates flags -s_cmp_eq_u32 s76, 0 // Check for wave 0 +v_readfirstlane_b32 s79, v[vgprSerial] // Wave 0 updates flags +s_cmp_eq_u32 s79, 0 // Check for wave 0 s_cbranch_scc0 label_SK_SkipFlagReset // Skip flag reset -s_store_dword s76, s[sgprAddressFlags:sgprAddressFlags+1], s74 glc // reset flag +s_store_dword s79, s[sgprAddressFlags:sgprAddressFlags+1], s77 glc // reset flag label_SK_SkipFlagReset: label_Fixup_E0: @@ -4241,8 +4171,8 @@ s_mov_b64 s[sgprSrdWS+0:sgprSrdWS+0+1], s[sgprAddressWS+0:sgprAddressWS+0+1] // s_mov_b32 s[sgprSrdWS+2], BufferOOB s_mov_b32 s[sgprSrdWS+3], Srd127_96 // Set bits 127_96 in post-loop SRD -s_mul_i32 s74, 0x40000, s85 // Offset to correct partials tile -s_add_u32 s[sgprSrdWS+0], s[sgprSrdWS+0], s74 // add lo to SRD +s_mul_i32 s78, 0x40000, s65 // Offset to correct partials tile +s_add_u32 s[sgprSrdWS+0], s[sgprSrdWS+0], s78 // add lo to SRD s_addc_u32 s[sgprSrdWS+1], s[sgprSrdWS+1], 0 // add hi to SRD /* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ @@ -4253,45 +4183,45 @@ s_addc_u32 s[sgprSrdWS+1], s[sgprSrdWS+1], 0 // add hi to SRD /* calc coords, apply mask, and issue loads (if necessary) */ v_lshlrev_b32 v18, 5, v[vgprSerial] // v18 = v[vgprSerial] * 32 -s_mov_b32 s74, 0 // Init sgpr offset -buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[224:227], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[228:231], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[232:235], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[236:239], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_mov_b32 s78, 0 // Init sgpr offset +buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[224:227], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[228:231], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[232:235], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[236:239], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] @@ -4646,45 +4576,45 @@ s_nop 0 // 1 wait state required when /******************************************/ /* calc coords, apply mask, and issue loads (if necessary) */ -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[224:227], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[228:231], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[232:235], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[236:239], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[224:227], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[228:231], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[232:235], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[236:239], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS v_accvgpr_read_b32 v[vgprValuC+24], acc161 // copy acc to vreg[104] v_accvgpr_read_b32 v[vgprValuC+25], acc165 // copy acc to vreg[105] v_accvgpr_read_b32 v[vgprValuC+26], acc169 // copy acc to vreg[106] @@ -5039,24 +4969,24 @@ s_nop 0 // 1 wait state required when /******************************************/ /* calc coords, apply mask, and issue loads (if necessary) */ -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[72:75], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[76:79], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[80:83], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[84:87], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[88:91], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[92:95], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[96:99], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[100:103], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[104:107], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[108:111], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[112:115], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[116:119], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[72:75], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[76:79], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[80:83], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[84:87], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[88:91], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[92:95], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[96:99], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[100:103], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[104:107], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[108:111], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[112:115], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[116:119], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS v_accvgpr_read_b32 v[vgprValuC+24], acc67 // copy acc to vreg[208] v_accvgpr_read_b32 v[vgprValuC+25], acc71 // copy acc to vreg[209] v_accvgpr_read_b32 v[vgprValuC+26], acc75 // copy acc to vreg[210] @@ -5221,42 +5151,42 @@ v_accvgpr_write_b32 acc251, v[vgprValuC+70] // copy vreg[254] to acc v_accvgpr_write_b32 acc255, v[vgprValuC+71] // copy vreg[255] to acc s_nop 1 // 2 wait states required before reading vgpr s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -s_add_u32 s74, s[sgprSKItersPerWG], 1 // Add extra iter -s_cmp_lt_u32 s85, s[sgprskExtraIters] // Check if next WG had an extra iteration -s_cselect_b32 s74, s74, s[sgprSKItersPerWG] // Select correct number of iterations for next WG -s_add_u32 s86, s86, s74 // next partial tile iteration -s_add_u32 s85, s85, 1 // next partial tile index -s_cmp_lt_u32 s86, s[sgprItersPerTile] // done loading partial tiles? +s_add_u32 s69, s[sgprSKItersPerWG], 1 // Add extra iter +s_cmp_lt_u32 s65, s[sgprskExtraIters] // Check if next WG had an extra iteration +s_cselect_b32 s69, s69, s[sgprSKItersPerWG] // Select correct number of iterations for next WG +s_add_u32 s68, s68, s69 // next partial tile iteration +s_add_u32 s65, s65, 1 // next partial tile index +s_cmp_lt_u32 s68, s[sgprItersPerTile] // done loading partial tiles? s_cbranch_scc1 label_SK_Fixup // Branch to continue fixup loop label_SK_Store: s_cmpk_eq_u32 s[sgprBeta], 0 // Beta == 0 s_cbranch_scc0 label_GW_Beta // Branch if Beta is not zero -s_and_b32 s74, 255, s[sgprSizeI] // s74 = s[sgprSizeI] % 256 -s_add_u32 s75, -0x1, s[sgprNumWorkGroups0] -s_cmp_ge_u32 s[sgprWorkGroup0], s75 // wg0 >= nwg0-1 ? -s_cselect_b32 s74, s74, 0 // set rMT0 -s_cmpk_gt_u32 s74, 0 // rMT0 > 0 +s_and_b32 s78, 255, s[sgprSizeI] // s78 = s[sgprSizeI] % 256 +s_add_u32 s79, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s79 // wg0 >= nwg0-1 ? +s_cselect_b32 s78, s78, 0 // set rMT0 +s_cmpk_gt_u32 s78, 0 // rMT0 > 0 s_cbranch_scc0 label_NoBranch_0MXDW6EW9K7ZNG8F // Only branch on scc1 // jump if edges required -s_getpc_b64 s[74:75] // addr of next instr -s_add_i32 s76, label_GW_B0_E1_M, 4 // target branch offset -s_add_u32 s74, s74, s76 // add target branch offset -s_addc_u32 s75, s75, 0 // add high and carry -s_setpc_b64 s[74:75] // branch to label_GW_B0_E1_M +s_getpc_b64 s[78:79] // addr of next instr +s_add_i32 s80, label_GW_B0_E1_M, 4 // target branch offset +s_add_u32 s78, s78, s80 // add target branch offset +s_addc_u32 s79, s79, 0 // add high and carry +s_setpc_b64 s[78:79] // branch to label_GW_B0_E1_M label_NoBranch_0MXDW6EW9K7ZNG8F: -s_and_b32 s74, 255, s[sgprSizeJ] // s74 = s[sgprSizeJ] % 256 -s_add_u32 s75, -0x1, s[sgprNumWorkGroups1] -s_cmp_ge_u32 s[sgprWorkGroup1], s75 // wg1 >= nwg1-1 -s_cselect_b32 s74, s74, 0 // set rMT1 -s_cmpk_gt_u32 s74, 0 // rMT1 > 0 +s_and_b32 s78, 255, s[sgprSizeJ] // s78 = s[sgprSizeJ] % 256 +s_add_u32 s79, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s79 // wg1 >= nwg1-1 +s_cselect_b32 s78, s78, 0 // set rMT1 +s_cmpk_gt_u32 s78, 0 // rMT1 > 0 s_cbranch_scc0 label_NoBranch_IXPKU979JKZCQDH3 // Only branch on scc1 // jump if edges required -s_getpc_b64 s[74:75] // addr of next instr -s_add_i32 s76, label_GW_B0_E1_N, 4 // target branch offset -s_add_u32 s74, s74, s76 // add target branch offset -s_addc_u32 s75, s75, 0 // add high and carry -s_setpc_b64 s[74:75] // branch to label_GW_B0_E1_N +s_getpc_b64 s[78:79] // addr of next instr +s_add_i32 s80, label_GW_B0_E1_N, 4 // target branch offset +s_add_u32 s78, s78, s80 // add target branch offset +s_addc_u32 s79, s79, 0 // add high and carry +s_setpc_b64 s[78:79] // branch to label_GW_B0_E1_N label_NoBranch_IXPKU979JKZCQDH3: label_GW_B0_E0: s_cmpk_eq_u32 s[sgprActivationType], 3 // activationType == 3 @@ -5265,28 +5195,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_0_edge_0 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_0_edge_0 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_0_edge_0 // Branch if true label_To_Activation_None_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_To_Activation_Gelu_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_To_Activation_Relu_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_To_Activation_Silu_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_5 +label_To_Activation_Clamp_VW8_beta_0_edge_0: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s65, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_ActivationSetPCAddrEnd_5: @@ -5301,8 +5239,8 @@ label_ActivationSetPCAddrEnd_5: /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v19, v0, s74 +s_mul_i32 s68, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v19, v0, s68 v_lshlrev_b32 v19, 0x2, v19 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -5431,7 +5369,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -5449,7 +5387,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -5458,8 +5396,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5470,7 +5408,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -5479,8 +5417,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5491,7 +5429,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -5500,8 +5438,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5512,7 +5450,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -5521,8 +5459,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5533,7 +5471,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -5542,8 +5480,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5554,7 +5492,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -5563,8 +5501,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5575,7 +5513,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -5584,8 +5522,8 @@ v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -5722,7 +5660,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -5731,8 +5669,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5743,7 +5681,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -5752,8 +5690,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5764,7 +5702,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -5773,8 +5711,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5785,7 +5723,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -5794,8 +5732,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5806,7 +5744,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -5815,8 +5753,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5827,7 +5765,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -5836,8 +5774,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5848,7 +5786,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -5857,8 +5795,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5869,7 +5807,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -5878,8 +5816,8 @@ v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6016,7 +5954,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -6025,8 +5963,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6037,7 +5975,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -6046,8 +5984,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6058,7 +5996,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -6067,8 +6005,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6079,7 +6017,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -6088,8 +6026,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6100,7 +6038,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -6109,8 +6047,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6121,7 +6059,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -6130,8 +6068,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6142,7 +6080,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -6151,8 +6089,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6163,7 +6101,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -6172,8 +6110,8 @@ v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6310,7 +6248,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -6319,8 +6257,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6331,7 +6269,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -6340,8 +6278,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6352,7 +6290,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -6361,8 +6299,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6373,7 +6311,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -6382,8 +6320,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6394,7 +6332,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -6403,8 +6341,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6415,7 +6353,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -6424,8 +6362,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6436,7 +6374,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -6445,8 +6383,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6457,7 +6395,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -6466,8 +6404,8 @@ v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6479,28 +6417,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_0_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_0_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_0_edge_1 // Branch if true label_To_Activation_None_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_To_Activation_Gelu_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_To_Activation_Relu_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_To_Activation_Silu_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_4 +label_To_Activation_Clamp_VW8_beta_0_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s65, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_ActivationSetPCAddrEnd_4: @@ -6516,11 +6462,11 @@ label_ActivationSetPCAddrEnd_4: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -6529,105 +6475,105 @@ ds_read_b128 v[92:95], v18 offset:16 // load Bias ds_read_b128 v[96:99], v18 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v20, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v20, v0, s78 v_lshlrev_b32 v20, 0x2, v20 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v0, s78 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE v_add_lshl_u32 v21, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v12, v21, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v21, v12, v21, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v12, v23, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v23, v12, v23, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v0, s78 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v0, s78 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v12, v109, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v109, v12, v109, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v0, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] @@ -6740,7 +6686,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -6758,7 +6704,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -6776,7 +6722,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -6794,7 +6740,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -6812,7 +6758,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -6830,7 +6776,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -6848,7 +6794,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -6866,7 +6812,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -6892,116 +6838,116 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[88:91], v18 offset:0 // load Bias ds_read_b128 v[92:95], v18 offset:16 // load Bias ds_read_b128 v[96:99], v18 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v20, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v20, v0, s78 v_lshlrev_b32 v20, 0x2, v20 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v0, s78 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE v_add_lshl_u32 v21, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v12, v21, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v21, v12, v21, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v12, v23, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v23, v12, v23, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v0, s78 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v0, s78 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v12, v109, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v109, v12, v109, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v0, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc1 // copy acc to vreg[64] v_accvgpr_read_b32 v[vgprValuC+25], acc5 // copy acc to vreg[65] v_accvgpr_read_b32 v[vgprValuC+26], acc9 // copy acc to vreg[66] @@ -7114,7 +7060,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -7132,7 +7078,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -7150,7 +7096,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -7168,7 +7114,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -7186,7 +7132,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -7204,7 +7150,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -7222,7 +7168,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -7240,7 +7186,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -7266,116 +7212,116 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[88:91], v18 offset:0 // load Bias ds_read_b128 v[92:95], v18 offset:16 // load Bias ds_read_b128 v[96:99], v18 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v20, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v20, v0, s78 v_lshlrev_b32 v20, 0x2, v20 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v0, s78 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE v_add_lshl_u32 v21, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v12, v21, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v21, v12, v21, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v12, v23, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v23, v12, v23, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v0, s78 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v0, s78 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v12, v109, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v109, v12, v109, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v0, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc2 // copy acc to vreg[128] v_accvgpr_read_b32 v[vgprValuC+25], acc6 // copy acc to vreg[129] v_accvgpr_read_b32 v[vgprValuC+26], acc10 // copy acc to vreg[130] @@ -7488,7 +7434,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -7506,7 +7452,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -7524,7 +7470,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -7542,7 +7488,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -7560,7 +7506,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -7578,7 +7524,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -7596,7 +7542,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -7614,7 +7560,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -7640,116 +7586,116 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[88:91], v18 offset:0 // load Bias ds_read_b128 v[92:95], v18 offset:16 // load Bias ds_read_b128 v[96:99], v18 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v20, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v20, v0, s78 v_lshlrev_b32 v20, 0x2, v20 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v0, s78 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE v_add_lshl_u32 v21, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v12, v21, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v21, v12, v21, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v12, v23, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v23, v12, v23, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v0, s78 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v0, s78 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v12, v109, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v109, v12, v109, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v0, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc3 // copy acc to vreg[192] v_accvgpr_read_b32 v[vgprValuC+25], acc7 // copy acc to vreg[193] v_accvgpr_read_b32 v[vgprValuC+26], acc11 // copy acc to vreg[194] @@ -7862,7 +7808,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -7880,7 +7826,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -7898,7 +7844,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -7916,7 +7862,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -7934,7 +7880,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -7952,7 +7898,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -7970,7 +7916,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -7988,7 +7934,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -8007,28 +7953,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW1_beta_0_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW1_beta_0_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW1_beta_0_edge_1 // Branch if true label_To_Activation_None_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_None_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_None_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_To_Activation_Gelu_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Gelu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Gelu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_To_Activation_Relu_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Relu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Relu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_To_Activation_Silu_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Silu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Silu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_3 +label_To_Activation_Clamp_VW1_beta_0_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s65, label_Activation_Clamp_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_ActivationSetPCAddrEnd_3: @@ -8044,492 +7998,492 @@ label_ActivationSetPCAddrEnd_3: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v65, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v65, v0, s78 v_lshlrev_b32 v65, 0x2, v65 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier ds_read_b32 v62, v65 offset:0 // load Bias ds_read_b32 v63, v65 offset:1024 // load scaleAlpha v_add_lshl_u32 v64, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v64, v12, v64, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v64, v12, v64, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v73, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v73, v4, s78 v_lshlrev_b32 v73, 0x2, v73 // Bias address scaled by BPE ds_read_b32 v70, v73 offset:0 // load Bias ds_read_b32 v71, v73 offset:1024 // load scaleAlpha v_add_lshl_u32 v72, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v72, v12, v72, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v72, v12, v72, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v77, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v77, v4, s78 v_lshlrev_b32 v77, 0x2, v77 // Bias address scaled by BPE ds_read_b32 v74, v77 offset:0 // load Bias ds_read_b32 v75, v77 offset:1024 // load scaleAlpha v_add_lshl_u32 v76, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v76, v12, v76, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v76, v12, v76, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v4, s78 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v12, v80, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v80, v12, v80, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v85, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v85, v4, s78 v_lshlrev_b32 v85, 0x2, v85 // Bias address scaled by BPE ds_read_b32 v82, v85 offset:0 // load Bias ds_read_b32 v83, v85 offset:1024 // load scaleAlpha v_add_lshl_u32 v84, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v84, v12, v84, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v84, v12, v84, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE ds_read_b32 v90, v93 offset:0 // load Bias ds_read_b32 v91, v93 offset:1024 // load scaleAlpha v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v95, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v95, v0, s78 v_lshlrev_b32 v95, 0x2, v95 // Bias address scaled by BPE v_add_lshl_u32 v94, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v94, v12, v94, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v94, v12, v94, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v101, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v101, v4, s78 v_lshlrev_b32 v101, 0x2, v101 // Bias address scaled by BPE v_add_lshl_u32 v100, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v100, v12, v100, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v100, v12, v100, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v107, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v107, v4, s78 v_lshlrev_b32 v107, 0x2, v107 // Bias address scaled by BPE v_add_lshl_u32 v106, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v106, v12, v106, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v106, v12, v106, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v0, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v113, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v113, v4, s78 v_lshlrev_b32 v113, 0x2, v113 // Bias address scaled by BPE v_add_lshl_u32 v112, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v112, v12, v112, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v112, v12, v112, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v4, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v119, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v119, v4, s78 v_lshlrev_b32 v119, 0x2, v119 // Bias address scaled by BPE v_add_lshl_u32 v118, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v118, v12, v118, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v118, v12, v118, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v125, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v125, v4, s78 v_lshlrev_b32 v125, 0x2, v125 // Bias address scaled by BPE v_add_lshl_u32 v124, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v124, v12, v124, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v124, v12, v124, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v0, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v4, s78 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v131, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v131, v4, s78 v_lshlrev_b32 v131, 0x2, v131 // Bias address scaled by BPE v_add_lshl_u32 v130, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v130, v12, v130, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v130, v12, v130, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v4, s78 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v140, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v140, v4, s78 v_lshlrev_b32 v140, 0x2, v140 // Bias address scaled by BPE v_add_lshl_u32 v139, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v139, v12, v139, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v139, v12, v139, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v4, s78 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v146, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v146, v0, s78 v_lshlrev_b32 v146, 0x2, v146 // Bias address scaled by BPE v_add_lshl_u32 v145, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v145, v12, v145, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v145, v12, v145, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s78 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v12, v149, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v149, v12, v149, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v152, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v152, v4, s78 v_lshlrev_b32 v152, 0x2, v152 // Bias address scaled by BPE v_add_lshl_u32 v151, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v151, v12, v151, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v151, v12, v151, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v4, s78 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v12, v155, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v155, v12, v155, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v158, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v158, v4, s78 v_lshlrev_b32 v158, 0x2, v158 // Bias address scaled by BPE v_add_lshl_u32 v157, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v157, v12, v157, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v157, v12, v157, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v0, s78 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v12, v161, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v161, v12, v161, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v164, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v164, v4, s78 v_lshlrev_b32 v164, 0x2, v164 // Bias address scaled by BPE v_add_lshl_u32 v163, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v163, v12, v163, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v163, v12, v163, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v4, s78 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v12, v167, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v167, v12, v167, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v170, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v170, v4, s78 v_lshlrev_b32 v170, 0x2, v170 // Bias address scaled by BPE v_add_lshl_u32 v169, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v169, v12, v169, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v169, v12, v169, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+18], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+19], acc8 // copy acc to vreg[2] @@ -8608,271 +8562,271 @@ v_mov_b32 v15, 0x7fff0000 // fp32 Nan v_mov_b32 v16, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v55, v4 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v56, v4 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v57, v4 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v58, v4 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v59, v4 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v60, v4 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v61, v4 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -8888,494 +8842,494 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,5,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v65, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v65, v4, s78 v_lshlrev_b32 v65, 0x2, v65 // Bias address scaled by BPE ds_read_b32 v62, v65 offset:0 // load Bias ds_read_b32 v63, v65 offset:1024 // load scaleAlpha v_add_lshl_u32 v64, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v64, v12, v64, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v64, v12, v64, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v73, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v73, v4, s78 v_lshlrev_b32 v73, 0x2, v73 // Bias address scaled by BPE ds_read_b32 v70, v73 offset:0 // load Bias ds_read_b32 v71, v73 offset:1024 // load scaleAlpha v_add_lshl_u32 v72, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v72, v12, v72, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v72, v12, v72, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v77, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v77, v0, s78 v_lshlrev_b32 v77, 0x2, v77 // Bias address scaled by BPE ds_read_b32 v74, v77 offset:0 // load Bias ds_read_b32 v75, v77 offset:1024 // load scaleAlpha v_add_lshl_u32 v76, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v76, v12, v76, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v76, v12, v76, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v4, s78 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v12, v80, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v80, v12, v80, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v85, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v85, v4, s78 v_lshlrev_b32 v85, 0x2, v85 // Bias address scaled by BPE ds_read_b32 v82, v85 offset:0 // load Bias ds_read_b32 v83, v85 offset:1024 // load scaleAlpha v_add_lshl_u32 v84, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v84, v12, v84, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v84, v12, v84, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE ds_read_b32 v90, v93 offset:0 // load Bias ds_read_b32 v91, v93 offset:1024 // load scaleAlpha v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v95, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v95, v4, s78 v_lshlrev_b32 v95, 0x2, v95 // Bias address scaled by BPE v_add_lshl_u32 v94, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v94, v12, v94, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v94, v12, v94, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v101, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v101, v0, s78 v_lshlrev_b32 v101, 0x2, v101 // Bias address scaled by BPE v_add_lshl_u32 v100, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v100, v12, v100, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v100, v12, v100, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v107, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v107, v4, s78 v_lshlrev_b32 v107, 0x2, v107 // Bias address scaled by BPE v_add_lshl_u32 v106, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v106, v12, v106, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v106, v12, v106, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v4, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v113, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v113, v4, s78 v_lshlrev_b32 v113, 0x2, v113 // Bias address scaled by BPE v_add_lshl_u32 v112, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v112, v12, v112, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v112, v12, v112, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v0, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v119, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v119, v4, s78 v_lshlrev_b32 v119, 0x2, v119 // Bias address scaled by BPE v_add_lshl_u32 v118, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v118, v12, v118, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v118, v12, v118, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v125, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v125, v4, s78 v_lshlrev_b32 v125, 0x2, v125 // Bias address scaled by BPE v_add_lshl_u32 v124, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v124, v12, v124, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v124, v12, v124, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v4, s78 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v131, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v131, v4, s78 v_lshlrev_b32 v131, 0x2, v131 // Bias address scaled by BPE v_add_lshl_u32 v130, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v130, v12, v130, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v130, v12, v130, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v0, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v4, s78 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v140, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v140, v4, s78 v_lshlrev_b32 v140, 0x2, v140 // Bias address scaled by BPE v_add_lshl_u32 v139, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v139, v12, v139, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v139, v12, v139, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v4, s78 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v146, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v146, v4, s78 v_lshlrev_b32 v146, 0x2, v146 // Bias address scaled by BPE v_add_lshl_u32 v145, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v145, v12, v145, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v145, v12, v145, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s78 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v12, v149, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v149, v12, v149, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v152, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v152, v0, s78 v_lshlrev_b32 v152, 0x2, v152 // Bias address scaled by BPE v_add_lshl_u32 v151, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v151, v12, v151, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v151, v12, v151, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v4, s78 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v12, v155, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v155, v12, v155, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v158, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v158, v4, s78 v_lshlrev_b32 v158, 0x2, v158 // Bias address scaled by BPE v_add_lshl_u32 v157, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v157, v12, v157, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v157, v12, v157, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v4, s78 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v12, v161, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v161, v12, v161, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v164, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v164, v4, s78 v_lshlrev_b32 v164, 0x2, v164 // Bias address scaled by BPE v_add_lshl_u32 v163, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v163, v12, v163, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v163, v12, v163, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v0, s78 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v12, v167, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v167, v12, v167, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v170, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v170, v4, s78 v_lshlrev_b32 v170, 0x2, v170 // Bias address scaled by BPE v_add_lshl_u32 v169, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v169, v12, v169, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v169, v12, v169, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc180 // copy acc to vreg[45] v_accvgpr_read_b32 v[vgprValuC+18], acc184 // copy acc to vreg[46] v_accvgpr_read_b32 v[vgprValuC+19], acc188 // copy acc to vreg[47] @@ -9454,271 +9408,271 @@ v_mov_b32 v15, 0x7fff0000 // fp32 Nan v_mov_b32 v16, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v55, v4 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v56, v4 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v57, v4 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v58, v4 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v59, v4 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v60, v4 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v61, v4 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -9734,490 +9688,490 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,11,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v65, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v65, v4, s78 v_lshlrev_b32 v65, 0x2, v65 // Bias address scaled by BPE ds_read_b32 v62, v65 offset:0 // load Bias ds_read_b32 v63, v65 offset:1024 // load scaleAlpha v_add_lshl_u32 v64, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v64, v12, v64, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v64, v12, v64, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v73, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v73, v4, s78 v_lshlrev_b32 v73, 0x2, v73 // Bias address scaled by BPE ds_read_b32 v70, v73 offset:0 // load Bias ds_read_b32 v71, v73 offset:1024 // load scaleAlpha v_add_lshl_u32 v72, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v72, v12, v72, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v72, v12, v72, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v77, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v77, v4, s78 v_lshlrev_b32 v77, 0x2, v77 // Bias address scaled by BPE ds_read_b32 v74, v77 offset:0 // load Bias ds_read_b32 v75, v77 offset:1024 // load scaleAlpha v_add_lshl_u32 v76, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v76, v12, v76, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v76, v12, v76, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v4, s78 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v12, v80, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v80, v12, v80, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v85, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v85, v4, s78 v_lshlrev_b32 v85, 0x2, v85 // Bias address scaled by BPE ds_read_b32 v82, v85 offset:0 // load Bias ds_read_b32 v83, v85 offset:1024 // load scaleAlpha v_add_lshl_u32 v84, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v84, v12, v84, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v84, v12, v84, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v0, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE ds_read_b32 v90, v93 offset:0 // load Bias ds_read_b32 v91, v93 offset:1024 // load scaleAlpha v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v95, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v95, v4, s78 v_lshlrev_b32 v95, 0x2, v95 // Bias address scaled by BPE v_add_lshl_u32 v94, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v94, v12, v94, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v94, v12, v94, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v101, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v101, v4, s78 v_lshlrev_b32 v101, 0x2, v101 // Bias address scaled by BPE v_add_lshl_u32 v100, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v100, v12, v100, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v100, v12, v100, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v107, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v107, v0, s78 v_lshlrev_b32 v107, 0x2, v107 // Bias address scaled by BPE v_add_lshl_u32 v106, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v106, v12, v106, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v106, v12, v106, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v4, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v113, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v113, v4, s78 v_lshlrev_b32 v113, 0x2, v113 // Bias address scaled by BPE v_add_lshl_u32 v112, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v112, v12, v112, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v112, v12, v112, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v4, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v119, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v119, v4, s78 v_lshlrev_b32 v119, 0x2, v119 // Bias address scaled by BPE v_add_lshl_u32 v118, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v118, v12, v118, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v118, v12, v118, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v0, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v125, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v125, v4, s78 v_lshlrev_b32 v125, 0x2, v125 // Bias address scaled by BPE v_add_lshl_u32 v124, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v124, v12, v124, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v124, v12, v124, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v4, s78 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v131, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v131, v4, s78 v_lshlrev_b32 v131, 0x2, v131 // Bias address scaled by BPE v_add_lshl_u32 v130, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v130, v12, v130, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v130, v12, v130, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v4, s78 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v140, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v140, v4, s78 v_lshlrev_b32 v140, 0x2, v140 // Bias address scaled by BPE v_add_lshl_u32 v139, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v139, v12, v139, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v139, v12, v139, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v0, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v4, s78 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v146, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v146, v4, s78 v_lshlrev_b32 v146, 0x2, v146 // Bias address scaled by BPE v_add_lshl_u32 v145, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v145, v12, v145, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v145, v12, v145, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s78 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v12, v149, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v149, v12, v149, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v152, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v152, v4, s78 v_lshlrev_b32 v152, 0x2, v152 // Bias address scaled by BPE v_add_lshl_u32 v151, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v151, v12, v151, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v151, v12, v151, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v4, s78 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v12, v155, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v155, v12, v155, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v158, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v158, v0, s78 v_lshlrev_b32 v158, 0x2, v158 // Bias address scaled by BPE v_add_lshl_u32 v157, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v157, v12, v157, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v157, v12, v157, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v4, s78 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v12, v161, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v161, v12, v161, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v164, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v164, v4, s78 v_lshlrev_b32 v164, 0x2, v164 // Bias address scaled by BPE v_add_lshl_u32 v163, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v163, v12, v163, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v163, v12, v163, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v4, s78 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v12, v167, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v167, v12, v167, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v170, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v170, v4, s78 v_lshlrev_b32 v170, 0x2, v170 // Bias address scaled by BPE v_add_lshl_u32 v169, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v169, v12, v169, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v169, v12, v169, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc105 // copy acc to vreg[90] v_accvgpr_read_b32 v[vgprValuC+18], acc109 // copy acc to vreg[91] v_accvgpr_read_b32 v[vgprValuC+19], acc113 // copy acc to vreg[92] @@ -10296,271 +10250,271 @@ v_mov_b32 v15, 0x7fff0000 // fp32 Nan v_mov_b32 v16, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v55, v4 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v56, v4 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v57, v4 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v58, v4 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v59, v4 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v60, v4 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v61, v4 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -10576,494 +10530,494 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,16,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v65, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v65, v4, s78 v_lshlrev_b32 v65, 0x2, v65 // Bias address scaled by BPE ds_read_b32 v62, v65 offset:0 // load Bias ds_read_b32 v63, v65 offset:1024 // load scaleAlpha v_add_lshl_u32 v64, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v64, v12, v64, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v64, v12, v64, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v0, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v73, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v73, v4, s78 v_lshlrev_b32 v73, 0x2, v73 // Bias address scaled by BPE ds_read_b32 v70, v73 offset:0 // load Bias ds_read_b32 v71, v73 offset:1024 // load scaleAlpha v_add_lshl_u32 v72, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v72, v12, v72, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v72, v12, v72, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v77, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v77, v4, s78 v_lshlrev_b32 v77, 0x2, v77 // Bias address scaled by BPE ds_read_b32 v74, v77 offset:0 // load Bias ds_read_b32 v75, v77 offset:1024 // load scaleAlpha v_add_lshl_u32 v76, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v76, v12, v76, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v76, v12, v76, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v4, s78 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v12, v80, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v80, v12, v80, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v85, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v85, v4, s78 v_lshlrev_b32 v85, 0x2, v85 // Bias address scaled by BPE ds_read_b32 v82, v85 offset:0 // load Bias ds_read_b32 v83, v85 offset:1024 // load scaleAlpha v_add_lshl_u32 v84, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v84, v12, v84, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v84, v12, v84, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE ds_read_b32 v90, v93 offset:0 // load Bias ds_read_b32 v91, v93 offset:1024 // load scaleAlpha v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v95, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v95, v4, s78 v_lshlrev_b32 v95, 0x2, v95 // Bias address scaled by BPE v_add_lshl_u32 v94, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v94, v12, v94, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v94, v12, v94, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v0, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v101, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v101, v4, s78 v_lshlrev_b32 v101, 0x2, v101 // Bias address scaled by BPE v_add_lshl_u32 v100, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v100, v12, v100, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v100, v12, v100, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v107, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v107, v4, s78 v_lshlrev_b32 v107, 0x2, v107 // Bias address scaled by BPE v_add_lshl_u32 v106, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v106, v12, v106, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v106, v12, v106, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v4, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v113, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v113, v0, s78 v_lshlrev_b32 v113, 0x2, v113 // Bias address scaled by BPE v_add_lshl_u32 v112, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v112, v12, v112, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v112, v12, v112, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v4, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v119, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v119, v4, s78 v_lshlrev_b32 v119, 0x2, v119 // Bias address scaled by BPE v_add_lshl_u32 v118, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v118, v12, v118, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v118, v12, v118, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v125, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v125, v4, s78 v_lshlrev_b32 v125, 0x2, v125 // Bias address scaled by BPE v_add_lshl_u32 v124, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v124, v12, v124, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v124, v12, v124, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v0, s78 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v131, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v131, v4, s78 v_lshlrev_b32 v131, 0x2, v131 // Bias address scaled by BPE v_add_lshl_u32 v130, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v130, v12, v130, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v130, v12, v130, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v4, s78 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v140, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v140, v4, s78 v_lshlrev_b32 v140, 0x2, v140 // Bias address scaled by BPE v_add_lshl_u32 v139, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v139, v12, v139, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v139, v12, v139, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v4, s78 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v146, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v146, v4, s78 v_lshlrev_b32 v146, 0x2, v146 // Bias address scaled by BPE v_add_lshl_u32 v145, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v145, v12, v145, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v145, v12, v145, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v0, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s78 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v12, v149, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v149, v12, v149, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v152, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v152, v4, s78 v_lshlrev_b32 v152, 0x2, v152 // Bias address scaled by BPE v_add_lshl_u32 v151, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v151, v12, v151, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v151, v12, v151, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v4, s78 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v12, v155, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v155, v12, v155, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v158, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v158, v4, s78 v_lshlrev_b32 v158, 0x2, v158 // Bias address scaled by BPE v_add_lshl_u32 v157, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v157, v12, v157, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v157, v12, v157, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v4, s78 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v12, v161, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v161, v12, v161, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v164, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v164, v0, s78 v_lshlrev_b32 v164, 0x2, v164 // Bias address scaled by BPE v_add_lshl_u32 v163, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v163, v12, v163, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v163, v12, v163, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v4, s78 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v12, v167, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v167, v12, v167, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v170, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v170, v4, s78 v_lshlrev_b32 v170, 0x2, v170 // Bias address scaled by BPE v_add_lshl_u32 v169, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v169, v12, v169, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v169, v12, v169, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc30 // copy acc to vreg[135] v_accvgpr_read_b32 v[vgprValuC+18], acc34 // copy acc to vreg[136] v_accvgpr_read_b32 v[vgprValuC+19], acc38 // copy acc to vreg[137] @@ -11142,271 +11096,271 @@ v_mov_b32 v15, 0x7fff0000 // fp32 Nan v_mov_b32 v16, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v55, v4 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v56, v4 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v57, v4 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v58, v4 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v59, v4 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v60, v4 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v61, v4 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -11422,494 +11376,494 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,22,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v65, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v65, v4, s78 v_lshlrev_b32 v65, 0x2, v65 // Bias address scaled by BPE ds_read_b32 v62, v65 offset:0 // load Bias ds_read_b32 v63, v65 offset:1024 // load scaleAlpha v_add_lshl_u32 v64, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v64, v12, v64, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v64, v12, v64, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v73, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v73, v4, s78 v_lshlrev_b32 v73, 0x2, v73 // Bias address scaled by BPE ds_read_b32 v70, v73 offset:0 // load Bias ds_read_b32 v71, v73 offset:1024 // load scaleAlpha v_add_lshl_u32 v72, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v72, v12, v72, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v72, v12, v72, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v77, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v77, v4, s78 v_lshlrev_b32 v77, 0x2, v77 // Bias address scaled by BPE ds_read_b32 v74, v77 offset:0 // load Bias ds_read_b32 v75, v77 offset:1024 // load scaleAlpha v_add_lshl_u32 v76, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v76, v12, v76, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v76, v12, v76, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v0, s78 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v12, v80, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v80, v12, v80, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v85, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v85, v4, s78 v_lshlrev_b32 v85, 0x2, v85 // Bias address scaled by BPE ds_read_b32 v82, v85 offset:0 // load Bias ds_read_b32 v83, v85 offset:1024 // load scaleAlpha v_add_lshl_u32 v84, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v84, v12, v84, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v84, v12, v84, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE ds_read_b32 v90, v93 offset:0 // load Bias ds_read_b32 v91, v93 offset:1024 // load scaleAlpha v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v95, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v95, v4, s78 v_lshlrev_b32 v95, 0x2, v95 // Bias address scaled by BPE v_add_lshl_u32 v94, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v94, v12, v94, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v94, v12, v94, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v101, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v101, v4, s78 v_lshlrev_b32 v101, 0x2, v101 // Bias address scaled by BPE v_add_lshl_u32 v100, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v100, v12, v100, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v100, v12, v100, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v0, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v107, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v107, v4, s78 v_lshlrev_b32 v107, 0x2, v107 // Bias address scaled by BPE v_add_lshl_u32 v106, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v106, v12, v106, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v106, v12, v106, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v4, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v113, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v113, v4, s78 v_lshlrev_b32 v113, 0x2, v113 // Bias address scaled by BPE v_add_lshl_u32 v112, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v112, v12, v112, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v112, v12, v112, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v4, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v119, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v119, v0, s78 v_lshlrev_b32 v119, 0x2, v119 // Bias address scaled by BPE v_add_lshl_u32 v118, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v118, v12, v118, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v118, v12, v118, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v125, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v125, v4, s78 v_lshlrev_b32 v125, 0x2, v125 // Bias address scaled by BPE v_add_lshl_u32 v124, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v124, v12, v124, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v124, v12, v124, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v4, s78 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v131, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v131, v4, s78 v_lshlrev_b32 v131, 0x2, v131 // Bias address scaled by BPE v_add_lshl_u32 v130, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v130, v12, v130, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v130, v12, v130, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v0, s78 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v140, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v140, v4, s78 v_lshlrev_b32 v140, 0x2, v140 // Bias address scaled by BPE v_add_lshl_u32 v139, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v139, v12, v139, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v139, v12, v139, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v4, s78 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v146, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v146, v4, s78 v_lshlrev_b32 v146, 0x2, v146 // Bias address scaled by BPE v_add_lshl_u32 v145, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v145, v12, v145, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v145, v12, v145, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s78 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v12, v149, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v149, v12, v149, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v152, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v152, v4, s78 v_lshlrev_b32 v152, 0x2, v152 // Bias address scaled by BPE v_add_lshl_u32 v151, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v151, v12, v151, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v151, v12, v151, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v0, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v4, s78 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v12, v155, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v155, v12, v155, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v158, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v158, v4, s78 v_lshlrev_b32 v158, 0x2, v158 // Bias address scaled by BPE v_add_lshl_u32 v157, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v157, v12, v157, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v157, v12, v157, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v4, s78 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v12, v161, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v161, v12, v161, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v164, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v164, v4, s78 v_lshlrev_b32 v164, 0x2, v164 // Bias address scaled by BPE v_add_lshl_u32 v163, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v163, v12, v163, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v163, v12, v163, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v4, s78 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v12, v167, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v167, v12, v167, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v170, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v170, v0, s78 v_lshlrev_b32 v170, 0x2, v170 // Bias address scaled by BPE v_add_lshl_u32 v169, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v169, v12, v169, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v169, v12, v169, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc210 // copy acc to vreg[180] v_accvgpr_read_b32 v[vgprValuC+18], acc214 // copy acc to vreg[181] v_accvgpr_read_b32 v[vgprValuC+19], acc218 // copy acc to vreg[182] @@ -11988,271 +11942,271 @@ v_mov_b32 v15, 0x7fff0000 // fp32 Nan v_mov_b32 v16, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v55, v4 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v56, v4 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v57, v4 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v58, v4 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v59, v4 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v60, v4 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v61, v4 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -12268,342 +12222,342 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,28,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v51, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v51, v4, s78 v_lshlrev_b32 v51, 0x2, v51 // Bias address scaled by BPE ds_read_b32 v48, v51 offset:0 // load Bias ds_read_b32 v49, v51 offset:1024 // load scaleAlpha v_add_lshl_u32 v50, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v50, v12, v50, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v50, v12, v50, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v55, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v55, v4, s78 v_lshlrev_b32 v55, 0x2, v55 // Bias address scaled by BPE ds_read_b32 v52, v55 offset:0 // load Bias ds_read_b32 v53, v55 offset:1024 // load scaleAlpha v_add_lshl_u32 v54, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v54, v12, v54, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v54, v12, v54, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v63, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v63, v4, s78 v_lshlrev_b32 v63, 0x2, v63 // Bias address scaled by BPE ds_read_b32 v60, v63 offset:0 // load Bias ds_read_b32 v61, v63 offset:1024 // load scaleAlpha v_add_lshl_u32 v62, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v62, v12, v62, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v62, v12, v62, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v67, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v67, v4, s78 v_lshlrev_b32 v67, 0x2, v67 // Bias address scaled by BPE ds_read_b32 v64, v67 offset:0 // load Bias ds_read_b32 v65, v67 offset:1024 // load scaleAlpha v_add_lshl_u32 v66, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v66, v12, v66, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v66, v12, v66, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v71, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v71, v4, s78 v_lshlrev_b32 v71, 0x2, v71 // Bias address scaled by BPE ds_read_b32 v68, v71 offset:0 // load Bias ds_read_b32 v69, v71 offset:1024 // load scaleAlpha v_add_lshl_u32 v70, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v12, v70, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v70, v12, v70, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v75, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v75, v4, s78 v_lshlrev_b32 v75, 0x2, v75 // Bias address scaled by BPE ds_read_b32 v72, v75 offset:0 // load Bias ds_read_b32 v73, v75 offset:1024 // load scaleAlpha v_add_lshl_u32 v74, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v74, v12, v74, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v74, v12, v74, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v0, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v4, s78 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE v_add_lshl_u32 v80, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v12, v80, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v80, v12, v80, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v83, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v83, v4, s78 v_lshlrev_b32 v83, 0x2, v83 // Bias address scaled by BPE v_add_lshl_u32 v82, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v82, v12, v82, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v82, v12, v82, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v85, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v85, v4, s78 v_lshlrev_b32 v85, 0x2, v85 // Bias address scaled by BPE v_add_lshl_u32 v84, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v84, v12, v84, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v84, v12, v84, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v87, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v87, v4, s78 v_lshlrev_b32 v87, 0x2, v87 // Bias address scaled by BPE v_add_lshl_u32 v86, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v86, v12, v86, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v86, v12, v86, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v91, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v91, v4, s78 v_lshlrev_b32 v91, 0x2, v91 // Bias address scaled by BPE v_add_lshl_u32 v90, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v12, v90, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v90, v12, v90, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v95, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v95, v0, s78 v_lshlrev_b32 v95, 0x2, v95 // Bias address scaled by BPE v_add_lshl_u32 v94, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v94, v12, v94, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v94, v12, v94, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v101, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v101, v4, s78 v_lshlrev_b32 v101, 0x2, v101 // Bias address scaled by BPE v_add_lshl_u32 v100, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v100, v12, v100, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v100, v12, v100, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v107, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v107, v4, s78 v_lshlrev_b32 v107, 0x2, v107 // Bias address scaled by BPE v_add_lshl_u32 v106, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v106, v12, v106, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v106, v12, v106, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v0, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v113, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v113, v4, s78 v_lshlrev_b32 v113, 0x2, v113 // Bias address scaled by BPE v_add_lshl_u32 v112, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v112, v12, v112, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v112, v12, v112, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v4, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v119, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v119, v4, s78 v_lshlrev_b32 v119, 0x2, v119 // Bias address scaled by BPE v_add_lshl_u32 v118, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v118, v12, v118, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v118, v12, v118, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v125, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v125, v4, s78 v_lshlrev_b32 v125, 0x2, v125 // Bias address scaled by BPE v_add_lshl_u32 v124, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v124, v12, v124, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v124, v12, v124, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc135 // copy acc to vreg[225] v_accvgpr_read_b32 v[vgprValuC+18], acc139 // copy acc to vreg[226] v_accvgpr_read_b32 v[vgprValuC+19], acc143 // copy acc to vreg[227] @@ -12661,204 +12615,204 @@ v_mov_b32 v15, 0x7fff0000 // fp32 Nan v_mov_b32 v16, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+17], v49, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v4, v48, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v50, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v53, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v4, v52, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v57, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v4, v56, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v61, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v4, v60, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v65, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v4, v64, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v69, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v4, v68, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v73, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v4, v72, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v74, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v77, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v4, v76, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v49, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v4, v48, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v53, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v4, v52, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v57, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v4, v56, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v61, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v4, v60, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v65, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v4, v64, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v69, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v4, v68, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v73, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v4, v72, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v77, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v4, v76, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v49, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v4, v48, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v53, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v4, v52, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v57, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v4, v56, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v61, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v4, v60, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v65, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v4, v64, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v69, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v4, v68, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v73, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v4, v72, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v77, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v4, v76, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v49, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v4, v48, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v53, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v4, v52, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v57, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v4, v56, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v61, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v4, v60, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v65, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v4, v64, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v69, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v4, v68, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v73, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v4, v72, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst s_branch label_GW_End // jump to end label_GW_Beta: -s_and_b32 s74, 255, s[sgprSizeI] // s74 = s[sgprSizeI] % 256 -s_add_u32 s75, -0x1, s[sgprNumWorkGroups0] -s_cmp_ge_u32 s[sgprWorkGroup0], s75 // wg0 >= nwg0-1 ? -s_cselect_b32 s74, s74, 0 // set rMT0 -s_cmpk_gt_u32 s74, 0 // rMT0 > 0 +s_and_b32 s78, 255, s[sgprSizeI] // s78 = s[sgprSizeI] % 256 +s_add_u32 s79, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s79 // wg0 >= nwg0-1 ? +s_cselect_b32 s78, s78, 0 // set rMT0 +s_cmpk_gt_u32 s78, 0 // rMT0 > 0 s_cbranch_scc1 label_GW_B1_E1_M // jump if edges required -s_and_b32 s74, 255, s[sgprSizeJ] // s74 = s[sgprSizeJ] % 256 -s_add_u32 s75, -0x1, s[sgprNumWorkGroups1] -s_cmp_ge_u32 s[sgprWorkGroup1], s75 // wg1 >= nwg1-1 -s_cselect_b32 s74, s74, 0 // set rMT1 -s_cmpk_gt_u32 s74, 0 // rMT1 > 0 +s_and_b32 s78, 255, s[sgprSizeJ] // s78 = s[sgprSizeJ] % 256 +s_add_u32 s79, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s79 // wg1 >= nwg1-1 +s_cselect_b32 s78, s78, 0 // set rMT1 +s_cmpk_gt_u32 s78, 0 // rMT1 > 0 s_cbranch_scc1 label_GW_B1_E1_N // jump if edges required label_GW_B1_E0: s_cmpk_eq_u32 s[sgprActivationType], 3 // activationType == 3 @@ -12867,28 +12821,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_1_edge_0 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_1_edge_0 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_1_edge_0 // Branch if true label_To_Activation_None_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_To_Activation_Gelu_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_To_Activation_Relu_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_To_Activation_Silu_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_2 +label_To_Activation_Clamp_VW8_beta_1_edge_0: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s65, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_ActivationSetPCAddrEnd_2: @@ -12905,8 +12867,8 @@ label_ActivationSetPCAddrEnd_2: /* (d1,vc1,d0,vc0)=(0,0,0,0) */ v_add_lshl_u32 v18, v2, v0, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=0, coord0Vgpr=0 buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v19, v0, s74 +s_mul_i32 s68, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v19, v0, s68 v_lshlrev_b32 v19, 0x2, v19 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -12915,33 +12877,33 @@ ds_read_b128 v[84:87], v19 offset:16 // load Bias ds_read_b128 v[88:91], v19 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,1,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,2,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,3,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,4,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[108:111], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,5,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[112:115], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,6,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[116:119], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_add_lshl_u32 v17, v3, v0, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=0, coord0Vgpr=0 @@ -13062,7 +13024,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -13098,7 +13060,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -13107,8 +13069,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13137,7 +13099,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -13146,8 +13108,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13176,7 +13138,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -13185,8 +13147,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13215,7 +13177,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -13224,8 +13186,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13254,7 +13216,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -13263,8 +13225,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13293,7 +13255,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -13302,8 +13264,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -13316,8 +13278,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,7,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[80:83], v19 offset:0 // load Bias @@ -13325,33 +13287,33 @@ ds_read_b128 v[84:87], v19 offset:16 // load Bias ds_read_b128 v[88:91], v19 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,8,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,9,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,10,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,11,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[108:111], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,12,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[112:115], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,13,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[116:119], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc224 // copy acc to vreg[56] @@ -13471,7 +13433,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -13480,8 +13442,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13510,7 +13472,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -13519,8 +13481,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13549,7 +13511,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -13558,8 +13520,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13588,7 +13550,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -13597,8 +13559,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13627,7 +13589,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -13636,8 +13598,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13666,7 +13628,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -13675,8 +13637,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13705,7 +13667,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -13714,8 +13676,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -13728,8 +13690,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,14,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[80:83], v19 offset:0 // load Bias @@ -13737,33 +13699,33 @@ ds_read_b128 v[84:87], v19 offset:16 // load Bias ds_read_b128 v[88:91], v19 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,15,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,16,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,17,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,18,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[108:111], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,19,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[112:115], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,20,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[116:119], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc193 // copy acc to vreg[112] @@ -13883,7 +13845,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -13892,8 +13854,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13922,7 +13884,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -13931,8 +13893,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13961,7 +13923,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -13970,8 +13932,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14000,7 +13962,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -14009,8 +13971,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14039,7 +14001,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -14048,8 +14010,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14078,7 +14040,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -14087,8 +14049,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14117,7 +14079,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -14126,8 +14088,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -14140,8 +14102,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,21,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[80:83], v19 offset:0 // load Bias @@ -14149,33 +14111,33 @@ ds_read_b128 v[84:87], v19 offset:16 // load Bias ds_read_b128 v[88:91], v19 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,22,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,23,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,24,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,25,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[108:111], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,26,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[112:115], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,27,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[116:119], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc162 // copy acc to vreg[168] @@ -14295,7 +14257,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -14304,8 +14266,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14334,7 +14296,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -14343,8 +14305,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14373,7 +14335,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -14382,8 +14344,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14412,7 +14374,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -14421,8 +14383,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14451,7 +14413,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -14460,8 +14422,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14490,7 +14452,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -14499,8 +14461,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14529,7 +14491,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -14538,8 +14500,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -14552,8 +14514,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,28,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[56:59], v19 offset:0 // load Bias @@ -14561,18 +14523,18 @@ ds_read_b128 v[60:63], v19 offset:16 // load Bias ds_read_b128 v[64:67], v19 offset:1024 // load scaleAlpha ds_read_b128 v[68:71], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,29,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[72:75], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,30,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[76:79], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,31,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[80:83], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc131 // copy acc to vreg[224] @@ -14656,7 +14618,7 @@ v_pk_add_f32 v[4:5], v[56:57], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[58:59], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[60:61], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[62:63], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -14665,8 +14627,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14695,7 +14657,7 @@ v_pk_add_f32 v[4:5], v[56:57], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[58:59], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[60:61], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[62:63], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -14704,8 +14666,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14734,7 +14696,7 @@ v_pk_add_f32 v[4:5], v[56:57], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[58:59], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[60:61], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[62:63], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -14743,8 +14705,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14773,7 +14735,7 @@ v_pk_add_f32 v[4:5], v[56:57], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[58:59], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[60:61], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[62:63], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -14782,8 +14744,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -14795,28 +14757,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_1_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_1_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_1_edge_1 // Branch if true label_To_Activation_None_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_To_Activation_Gelu_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_To_Activation_Relu_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_To_Activation_Silu_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_1 +label_To_Activation_Clamp_VW8_beta_1_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s65, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_ActivationSetPCAddrEnd_1: @@ -14832,14 +14802,14 @@ label_ActivationSetPCAddrEnd_1: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v17, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -14848,92 +14818,92 @@ ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v19, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v0, s78 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v0, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v95, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v107, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v0, s78 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] @@ -15038,7 +15008,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -15072,7 +15042,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -15106,7 +15076,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -15140,7 +15110,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -15174,7 +15144,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -15208,7 +15178,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -15234,106 +15204,106 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v17, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[72:75], v18 offset:0 // load Bias ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v19, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v0, s78 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v0, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v95, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v107, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v0, s78 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc192 // copy acc to vreg[48] v_accvgpr_read_b32 v[vgprValuC+25], acc196 // copy acc to vreg[49] v_accvgpr_read_b32 v[vgprValuC+26], acc200 // copy acc to vreg[50] @@ -15438,7 +15408,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -15472,7 +15442,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -15506,7 +15476,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -15540,7 +15510,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -15574,7 +15544,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -15608,7 +15578,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -15634,106 +15604,106 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v17, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[72:75], v18 offset:0 // load Bias ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v19, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v0, s78 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v0, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v95, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v107, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v0, s78 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc129 // copy acc to vreg[96] v_accvgpr_read_b32 v[vgprValuC+25], acc133 // copy acc to vreg[97] v_accvgpr_read_b32 v[vgprValuC+26], acc137 // copy acc to vreg[98] @@ -15838,7 +15808,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -15872,7 +15842,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -15906,7 +15876,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -15940,7 +15910,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -15974,7 +15944,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -16008,7 +15978,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -16034,106 +16004,106 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v17, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[72:75], v18 offset:0 // load Bias ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v19, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v0, s78 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v0, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v95, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v107, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v0, s78 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc66 // copy acc to vreg[144] v_accvgpr_read_b32 v[vgprValuC+25], acc70 // copy acc to vreg[145] v_accvgpr_read_b32 v[vgprValuC+26], acc74 // copy acc to vreg[146] @@ -16238,7 +16208,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -16272,7 +16242,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -16306,7 +16276,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -16340,7 +16310,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -16374,7 +16344,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -16408,7 +16378,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -16434,106 +16404,106 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v17, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[72:75], v18 offset:0 // load Bias ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v19, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v0, s78 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v0, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v95, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v107, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v0, s78 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc3 // copy acc to vreg[192] v_accvgpr_read_b32 v[vgprValuC+25], acc7 // copy acc to vreg[193] v_accvgpr_read_b32 v[vgprValuC+26], acc11 // copy acc to vreg[194] @@ -16638,7 +16608,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -16672,7 +16642,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -16706,7 +16676,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -16740,7 +16710,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -16774,7 +16744,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -16808,7 +16778,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -16834,38 +16804,38 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v17, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[40:43], v18 offset:0 // load Bias ds_read_b128 v[44:47], v18 offset:16 // load Bias ds_read_b128 v[48:51], v18 offset:1024 // load scaleAlpha ds_read_b128 v[52:55], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v19, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[56:59], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v60, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v60, v0, s78 v_lshlrev_b32 v60, 0x2, v60 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc195 // copy acc to vreg[240] v_accvgpr_read_b32 v[vgprValuC+25], acc199 // copy acc to vreg[241] v_accvgpr_read_b32 v[vgprValuC+26], acc203 // copy acc to vreg[242] @@ -16922,7 +16892,7 @@ v_pk_add_f32 v[4:5], v[40:41], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[42:43], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[44:45], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[46:47], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -16956,7 +16926,7 @@ v_pk_add_f32 v[4:5], v[40:41], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[42:43], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[44:45], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[46:47], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -16975,28 +16945,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW1_beta_1_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW1_beta_1_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW1_beta_1_edge_1 // Branch if true label_To_Activation_None_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_None_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_None_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_To_Activation_Gelu_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Gelu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Gelu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_To_Activation_Relu_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Relu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Relu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_To_Activation_Silu_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Silu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Silu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd +label_To_Activation_Clamp_VW1_beta_1_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s65, label_Activation_Clamp_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_ActivationSetPCAddrEnd: @@ -17012,532 +16990,532 @@ label_ActivationSetPCAddrEnd: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v0, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v4, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v88, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v96, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v0, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v99, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v4, s78 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v102, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v108, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v111, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v114, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v117, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s78 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v120, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v0, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v123, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v4, s78 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v126, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v129, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s78 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v135, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v138, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v4, s78 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v141, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v144, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s78 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v147, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v0, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v150, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v4, s78 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v153, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v156, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s78 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v159, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v162, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v4, s78 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v165, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v168, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v4, s78 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v171, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v0, s78 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v174, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v4, s78 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v177, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v4, s78 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v180, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v4, s78 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v183, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v4, s78 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v186, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v4, s78 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+18], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+19], acc8 // copy acc to vreg[2] @@ -17608,7 +17586,7 @@ v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17616,7 +17594,7 @@ v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17624,7 +17602,7 @@ v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17632,7 +17610,7 @@ v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17640,7 +17618,7 @@ v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17648,7 +17626,7 @@ v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17656,7 +17634,7 @@ v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17664,7 +17642,7 @@ v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v90 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17672,7 +17650,7 @@ v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v95 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17680,7 +17658,7 @@ v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17688,7 +17666,7 @@ v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17696,7 +17674,7 @@ v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17704,7 +17682,7 @@ v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17712,7 +17690,7 @@ v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17720,7 +17698,7 @@ v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17728,7 +17706,7 @@ v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v116 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17736,7 +17714,7 @@ v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v119 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17744,7 +17722,7 @@ v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v122 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17752,7 +17730,7 @@ v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v125 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17760,7 +17738,7 @@ v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v128 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17768,7 +17746,7 @@ v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v131 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17776,7 +17754,7 @@ v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v137 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17784,7 +17762,7 @@ v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v140 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17792,7 +17770,7 @@ v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v143 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17800,7 +17778,7 @@ v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v146 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17808,7 +17786,7 @@ v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v149 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17816,7 +17794,7 @@ v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v152 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17824,7 +17802,7 @@ v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v155 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17832,7 +17810,7 @@ v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v158 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17840,7 +17818,7 @@ v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v161 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17848,7 +17826,7 @@ v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v164 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17856,7 +17834,7 @@ v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v167 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17864,7 +17842,7 @@ v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v170 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17872,7 +17850,7 @@ v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v173 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17880,7 +17858,7 @@ v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v176 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17888,7 +17866,7 @@ v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v179 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17896,7 +17874,7 @@ v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v182 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17904,7 +17882,7 @@ v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v185 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17920,534 +17898,534 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,4,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v0, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v4, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v88, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v96, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v99, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v4, s78 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v102, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v0, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v108, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v111, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v114, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v117, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s78 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v120, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v123, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v4, s78 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v126, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v0, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v129, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s78 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v135, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v138, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v4, s78 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v141, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v144, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s78 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v147, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v150, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v4, s78 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v153, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v0, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v156, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s78 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v159, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v162, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v4, s78 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v165, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v168, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v4, s78 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v171, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v4, s78 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v174, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v4, s78 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v177, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v0, s78 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v180, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v4, s78 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v183, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v4, s78 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v186, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v4, s78 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc152 // copy acc to vreg[38] v_accvgpr_read_b32 v[vgprValuC+18], acc156 // copy acc to vreg[39] v_accvgpr_read_b32 v[vgprValuC+19], acc160 // copy acc to vreg[40] @@ -18518,7 +18496,7 @@ v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18526,7 +18504,7 @@ v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18534,7 +18512,7 @@ v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18542,7 +18520,7 @@ v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18550,7 +18528,7 @@ v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18558,7 +18536,7 @@ v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18566,7 +18544,7 @@ v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18574,7 +18552,7 @@ v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v90 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18582,7 +18560,7 @@ v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v95 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18590,7 +18568,7 @@ v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18598,7 +18576,7 @@ v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18606,7 +18584,7 @@ v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18614,7 +18592,7 @@ v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18622,7 +18600,7 @@ v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18630,7 +18608,7 @@ v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18638,7 +18616,7 @@ v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v116 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18646,7 +18624,7 @@ v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v119 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18654,7 +18632,7 @@ v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v122 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18662,7 +18640,7 @@ v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v125 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18670,7 +18648,7 @@ v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v128 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18678,7 +18656,7 @@ v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v131 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18686,7 +18664,7 @@ v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v137 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18694,7 +18672,7 @@ v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v140 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18702,7 +18680,7 @@ v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v143 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18710,7 +18688,7 @@ v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v146 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18718,7 +18696,7 @@ v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v149 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18726,7 +18704,7 @@ v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v152 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18734,7 +18712,7 @@ v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v155 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18742,7 +18720,7 @@ v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v158 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18750,7 +18728,7 @@ v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v161 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18758,7 +18736,7 @@ v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v164 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18766,7 +18744,7 @@ v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v167 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18774,7 +18752,7 @@ v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v170 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18782,7 +18760,7 @@ v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v173 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18790,7 +18768,7 @@ v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v176 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18798,7 +18776,7 @@ v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v179 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18806,7 +18784,7 @@ v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v182 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18814,7 +18792,7 @@ v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v185 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18830,534 +18808,534 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,9,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v0, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v88, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v96, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v99, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v4, s78 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v102, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v108, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v0, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v111, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v114, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v117, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s78 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v120, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v123, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v4, s78 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v126, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v129, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s78 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v135, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v0, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v138, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v4, s78 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v141, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v144, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s78 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v147, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v150, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v4, s78 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v153, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v156, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s78 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v159, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v0, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v162, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v4, s78 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v165, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v168, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v4, s78 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v171, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v4, s78 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v174, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v4, s78 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v177, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v4, s78 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v180, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v4, s78 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v183, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v0, s78 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v186, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v4, s78 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc49 // copy acc to vreg[76] v_accvgpr_read_b32 v[vgprValuC+18], acc53 // copy acc to vreg[77] v_accvgpr_read_b32 v[vgprValuC+19], acc57 // copy acc to vreg[78] @@ -19428,7 +19406,7 @@ v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19436,7 +19414,7 @@ v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19444,7 +19422,7 @@ v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19452,7 +19430,7 @@ v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19460,7 +19438,7 @@ v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19468,7 +19446,7 @@ v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19476,7 +19454,7 @@ v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19484,7 +19462,7 @@ v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v90 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19492,7 +19470,7 @@ v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v95 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19500,7 +19478,7 @@ v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19508,7 +19486,7 @@ v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19516,7 +19494,7 @@ v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19524,7 +19502,7 @@ v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19532,7 +19510,7 @@ v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19540,7 +19518,7 @@ v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19548,7 +19526,7 @@ v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v116 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19556,7 +19534,7 @@ v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v119 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19564,7 +19542,7 @@ v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v122 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19572,7 +19550,7 @@ v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v125 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19580,7 +19558,7 @@ v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v128 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19588,7 +19566,7 @@ v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v131 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19596,7 +19574,7 @@ v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v137 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19604,7 +19582,7 @@ v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v140 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19612,7 +19590,7 @@ v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v143 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19620,7 +19598,7 @@ v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v146 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19628,7 +19606,7 @@ v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v149 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19636,7 +19614,7 @@ v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v152 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19644,7 +19622,7 @@ v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v155 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19652,7 +19630,7 @@ v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v158 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19660,7 +19638,7 @@ v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v161 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19668,7 +19646,7 @@ v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v164 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19676,7 +19654,7 @@ v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v167 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19684,7 +19662,7 @@ v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v170 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19692,7 +19670,7 @@ v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v173 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19700,7 +19678,7 @@ v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v176 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19708,7 +19686,7 @@ v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v179 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19716,7 +19694,7 @@ v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v182 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19724,7 +19702,7 @@ v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v185 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19740,530 +19718,530 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,14,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v4, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v88, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v0, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v96, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v99, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v4, s78 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v102, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v108, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v111, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v114, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v0, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v117, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s78 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v120, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v123, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v4, s78 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v126, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v129, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s78 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v135, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v138, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v4, s78 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v141, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v0, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v144, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s78 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v147, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v150, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v4, s78 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v153, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v156, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s78 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v159, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v162, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v4, s78 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v165, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v0, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v168, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v4, s78 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v171, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v4, s78 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v174, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v4, s78 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v177, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v4, s78 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v180, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v4, s78 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v183, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v4, s78 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v186, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v4, s78 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc201 // copy acc to vreg[114] v_accvgpr_read_b32 v[vgprValuC+18], acc205 // copy acc to vreg[115] v_accvgpr_read_b32 v[vgprValuC+19], acc209 // copy acc to vreg[116] @@ -20334,7 +20312,7 @@ v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20342,7 +20320,7 @@ v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20350,7 +20328,7 @@ v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20358,7 +20336,7 @@ v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20366,7 +20344,7 @@ v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20374,7 +20352,7 @@ v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20382,7 +20360,7 @@ v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20390,7 +20368,7 @@ v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v90 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20398,7 +20376,7 @@ v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v95 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20406,7 +20384,7 @@ v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20414,7 +20392,7 @@ v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20422,7 +20400,7 @@ v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20430,7 +20408,7 @@ v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20438,7 +20416,7 @@ v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20446,7 +20424,7 @@ v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20454,7 +20432,7 @@ v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v116 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20462,7 +20440,7 @@ v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v119 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20470,7 +20448,7 @@ v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v122 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20478,7 +20456,7 @@ v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v125 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20486,7 +20464,7 @@ v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v128 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20494,7 +20472,7 @@ v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v131 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20502,7 +20480,7 @@ v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v137 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20510,7 +20488,7 @@ v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v140 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20518,7 +20496,7 @@ v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v143 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20526,7 +20504,7 @@ v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v146 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20534,7 +20512,7 @@ v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v149 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20542,7 +20520,7 @@ v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v152 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20550,7 +20528,7 @@ v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v155 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20558,7 +20536,7 @@ v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v158 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20566,7 +20544,7 @@ v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v161 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20574,7 +20552,7 @@ v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v164 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20582,7 +20560,7 @@ v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v167 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20590,7 +20568,7 @@ v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v170 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20598,7 +20576,7 @@ v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v173 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20606,7 +20584,7 @@ v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v176 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20614,7 +20592,7 @@ v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v179 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20622,7 +20600,7 @@ v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v182 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20630,7 +20608,7 @@ v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v185 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20650,530 +20628,530 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v0, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v4, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v88, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v96, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v0, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v99, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v4, s78 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v102, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v108, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v111, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v114, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v117, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s78 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v120, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v0, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v123, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v4, s78 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v126, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v129, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s78 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v135, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v138, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v4, s78 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v141, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v144, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s78 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v147, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v0, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v150, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v4, s78 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v153, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v156, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s78 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v159, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v162, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v4, s78 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v165, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v168, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v4, s78 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v171, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v0, s78 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v174, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v4, s78 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v177, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v4, s78 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v180, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v4, s78 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v183, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v4, s78 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v186, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v4, s78 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc98 // copy acc to vreg[152] v_accvgpr_read_b32 v[vgprValuC+18], acc102 // copy acc to vreg[153] v_accvgpr_read_b32 v[vgprValuC+19], acc106 // copy acc to vreg[154] @@ -21244,7 +21222,7 @@ v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21252,7 +21230,7 @@ v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21260,7 +21238,7 @@ v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21268,7 +21246,7 @@ v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21276,7 +21254,7 @@ v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21284,7 +21262,7 @@ v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21292,7 +21270,7 @@ v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21300,7 +21278,7 @@ v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v90 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21308,7 +21286,7 @@ v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v95 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21316,7 +21294,7 @@ v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21324,7 +21302,7 @@ v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21332,7 +21310,7 @@ v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21340,7 +21318,7 @@ v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21348,7 +21326,7 @@ v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21356,7 +21334,7 @@ v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21364,7 +21342,7 @@ v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v116 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21372,7 +21350,7 @@ v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v119 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21380,7 +21358,7 @@ v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v122 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21388,7 +21366,7 @@ v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v125 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21396,7 +21374,7 @@ v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v128 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21404,7 +21382,7 @@ v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v131 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21412,7 +21390,7 @@ v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v137 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21420,7 +21398,7 @@ v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v140 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21428,7 +21406,7 @@ v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v143 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21436,7 +21414,7 @@ v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v146 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21444,7 +21422,7 @@ v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v149 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21452,7 +21430,7 @@ v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v152 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21460,7 +21438,7 @@ v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v155 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21468,7 +21446,7 @@ v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v158 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21476,7 +21454,7 @@ v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v161 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21484,7 +21462,7 @@ v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v164 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21492,7 +21470,7 @@ v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v167 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21500,7 +21478,7 @@ v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v170 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21508,7 +21486,7 @@ v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v173 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21516,7 +21494,7 @@ v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v176 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21524,7 +21502,7 @@ v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v179 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21532,7 +21510,7 @@ v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v182 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21540,7 +21518,7 @@ v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v185 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21556,534 +21534,534 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,23,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v0, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v4, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v88, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v96, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v99, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v4, s78 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v102, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v0, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v108, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v111, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v114, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v117, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s78 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v120, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v123, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v4, s78 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v126, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v0, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v129, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s78 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v135, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v138, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v4, s78 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v141, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v144, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s78 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v147, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v150, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v4, s78 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v153, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v0, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v156, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s78 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v159, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v162, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v4, s78 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v165, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v168, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v4, s78 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v171, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v4, s78 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v174, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v4, s78 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v177, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v0, s78 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v180, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v4, s78 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v183, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v4, s78 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v186, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v4, s78 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc250 // copy acc to vreg[190] v_accvgpr_read_b32 v[vgprValuC+18], acc254 // copy acc to vreg[191] v_accvgpr_read_b32 v[vgprValuC+19], acc3 // copy acc to vreg[192] @@ -22154,7 +22132,7 @@ v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22162,7 +22140,7 @@ v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22170,7 +22148,7 @@ v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22178,7 +22156,7 @@ v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22186,7 +22164,7 @@ v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22194,7 +22172,7 @@ v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22202,7 +22180,7 @@ v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22210,7 +22188,7 @@ v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v90 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22218,7 +22196,7 @@ v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v95 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22226,7 +22204,7 @@ v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22234,7 +22212,7 @@ v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22242,7 +22220,7 @@ v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22250,7 +22228,7 @@ v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22258,7 +22236,7 @@ v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22266,7 +22244,7 @@ v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22274,7 +22252,7 @@ v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v116 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22282,7 +22260,7 @@ v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v119 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22290,7 +22268,7 @@ v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v122 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22298,7 +22276,7 @@ v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v125 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22306,7 +22284,7 @@ v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v128 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22314,7 +22292,7 @@ v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v131 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22322,7 +22300,7 @@ v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v137 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22330,7 +22308,7 @@ v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v140 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22338,7 +22316,7 @@ v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v143 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22346,7 +22324,7 @@ v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v146 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22354,7 +22332,7 @@ v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v149 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22362,7 +22340,7 @@ v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v152 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22370,7 +22348,7 @@ v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v155 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22378,7 +22356,7 @@ v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v158 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22386,7 +22364,7 @@ v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v161 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22394,7 +22372,7 @@ v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v164 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22402,7 +22380,7 @@ v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v167 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22410,7 +22388,7 @@ v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v170 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22418,7 +22396,7 @@ v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v173 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22426,7 +22404,7 @@ v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v176 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22434,7 +22412,7 @@ v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v179 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22442,7 +22420,7 @@ v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v182 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22450,7 +22428,7 @@ v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v185 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22466,396 +22444,396 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,28,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v48, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v48, v12, v48, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v48, v12, v48, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v45, v48, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v49, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v49, v4, s78 v_lshlrev_b32 v49, 0x2, v49 // Bias address scaled by BPE ds_read_b32 v46, v49 offset:0 // load Bias ds_read_b32 v47, v49 offset:1024 // load scaleAlpha v_add_lshl_u32 v48, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v48, v12, v48, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v48, v12, v48, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v53, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v53, v12, v53, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v53, v12, v53, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v50, v53, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v54, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v54, v4, s78 v_lshlrev_b32 v54, 0x2, v54 // Bias address scaled by BPE ds_read_b32 v51, v54 offset:0 // load Bias ds_read_b32 v52, v54 offset:1024 // load scaleAlpha v_add_lshl_u32 v53, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v53, v12, v53, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v53, v12, v53, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v0, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v4, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v86, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v86, v12, v86, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v86, v12, v86, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v86, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v87, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v87, v4, s78 v_lshlrev_b32 v87, 0x2, v87 // Bias address scaled by BPE v_add_lshl_u32 v86, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v86, v12, v86, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v86, v12, v86, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v89, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v89, v12, v89, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v89, v12, v89, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v88, v89, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v90, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v90, v4, s78 v_lshlrev_b32 v90, 0x2, v90 // Bias address scaled by BPE v_add_lshl_u32 v89, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v89, v12, v89, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v89, v12, v89, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v92, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v95, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v94, v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v4, s78 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v95, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v98, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v0, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v101, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v12, v101, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v101, v12, v101, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v100, v101, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v4, s78 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v12, v101, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v101, v12, v101, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v104, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v107, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v106, v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s78 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v110, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v4, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v113, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v12, v113, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v113, v12, v113, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v112, v113, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v4, s78 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v12, v113, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v113, v12, v113, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v116, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v4, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v119, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v12, v119, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v119, v12, v119, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v118, v119, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v4, s78 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v12, v119, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v119, v12, v119, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v122, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v0, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v125, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v12, v125, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v125, v12, v125, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v124, v125, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v4, s78 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v12, v125, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v125, v12, v125, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v128, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v4, s78 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v131, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v12, v131, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v131, v12, v131, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v130, v131, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v4, s78 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v12, v131, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v131, v12, v131, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v137, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v4, s78 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v140, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v12, v140, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v140, v12, v140, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v139, v140, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v4, s78 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v12, v140, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v140, v12, v140, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v143, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v4, s78 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v146, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v12, v146, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v146, v12, v146, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v145, v146, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v4, s78 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v12, v146, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v146, v12, v146, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc147 // copy acc to vreg[228] v_accvgpr_read_b32 v[vgprValuC+18], acc151 // copy acc to vreg[229] v_accvgpr_read_b32 v[vgprValuC+19], acc155 // copy acc to vreg[230] @@ -22911,7 +22889,7 @@ v_mul_f32 v[vgprValuC+17], v47, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v45 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v46, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22919,7 +22897,7 @@ v_mul_f32 v[vgprValuC+18], v52, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v50 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v51, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v53, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22927,7 +22905,7 @@ v_mul_f32 v[vgprValuC+19], v57, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22935,7 +22913,7 @@ v_mul_f32 v[vgprValuC+20], v62, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22943,7 +22921,7 @@ v_mul_f32 v[vgprValuC+21], v67, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22951,7 +22929,7 @@ v_mul_f32 v[vgprValuC+22], v72, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22959,7 +22937,7 @@ v_mul_f32 v[vgprValuC+23], v77, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22967,7 +22945,7 @@ v_mul_f32 v[vgprValuC+24], v82, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22975,7 +22953,7 @@ v_mul_f32 v[vgprValuC+25], v47, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v46, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22983,7 +22961,7 @@ v_mul_f32 v[vgprValuC+26], v52, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v88 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v51, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v89, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22991,7 +22969,7 @@ v_mul_f32 v[vgprValuC+27], v57, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v91 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22999,7 +22977,7 @@ v_mul_f32 v[vgprValuC+28], v62, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v94 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23007,7 +22985,7 @@ v_mul_f32 v[vgprValuC+29], v67, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v97 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23015,7 +22993,7 @@ v_mul_f32 v[vgprValuC+30], v72, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v100 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23023,7 +23001,7 @@ v_mul_f32 v[vgprValuC+31], v77, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v103 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23031,7 +23009,7 @@ v_mul_f32 v[vgprValuC+32], v82, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v106 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23039,7 +23017,7 @@ v_mul_f32 v[vgprValuC+33], v47, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v109 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v46, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23047,7 +23025,7 @@ v_mul_f32 v[vgprValuC+34], v52, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v112 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v51, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23055,7 +23033,7 @@ v_mul_f32 v[vgprValuC+35], v57, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v115 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23063,7 +23041,7 @@ v_mul_f32 v[vgprValuC+36], v62, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v118 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23071,7 +23049,7 @@ v_mul_f32 v[vgprValuC+37], v67, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v121 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23079,7 +23057,7 @@ v_mul_f32 v[vgprValuC+38], v72, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v124 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23087,7 +23065,7 @@ v_mul_f32 v[vgprValuC+39], v77, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v127 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23095,7 +23073,7 @@ v_mul_f32 v[vgprValuC+40], v82, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v130 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23103,7 +23081,7 @@ v_mul_f32 v[vgprValuC+41], v47, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v136 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v46, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23111,7 +23089,7 @@ v_mul_f32 v[vgprValuC+42], v52, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v139 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v51, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23119,7 +23097,7 @@ v_mul_f32 v[vgprValuC+43], v57, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v142 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23127,14 +23105,14 @@ v_mul_f32 v[vgprValuC+44], v62, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v145 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst s_branch label_GW_End // jump to end label_Activation_None_VW8: -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] label_Activation_Gelu_VW8: v_mul_f32 v12, 0x3d372713, v4 // k1 * x v_fma_f32 v12, v4, v12, 1.0 // 1 + (k1 * x * x) @@ -23232,7 +23210,7 @@ s_nop 0 // 1 wait states v_fma_f32 v12, -2.0, v12, 2.0 // ( + 1 (fused)) v_mul_f32 v12, v11, v12 // x * (1 + tanh(...)) v_mul_f32 v11, 0.5, v12 // 0.5 * x * (1 + tanh(...)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] label_Activation_Relu_VW8: v_max_f32 v4, v4, 0 // x = max(0, x) v_max_f32 v5, v5, 0 // x = max(0, x) @@ -23242,7 +23220,7 @@ v_max_f32 v8, v8, 0 // x = max(0, x) v_max_f32 v9, v9, 0 // x = max(0, x) v_max_f32 v10, v10, 0 // x = max(0, x) v_max_f32 v11, v11, 0 // x = max(0, x) -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] label_Activation_Silu_VW8: v_mul_f32 v12, -1.4426950408889634, v4 // (fused -1.442695) v_exp_f32 v12, v12 // exp step 2 @@ -23300,9 +23278,27 @@ v_add_f32 v12, 1.0, v12 // 1 + exp(-x) v_rcp_f32 v12, v12 // 1 / (1 + exp(-x)) s_nop 0 // 1 wait states v_mul_f32 v11, v11, v12 // x / (1 + exp(-x)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] +label_Activation_Clamp_VW8: +v_min_f32 v4, s[sgpractivationBeta], v4 // min(x, beta) +v_max_f32 v4, s[sgpractivationAlpha], v4 // max(alpha, min(x, beta)) +v_min_f32 v5, s[sgpractivationBeta], v5 // min(x, beta) +v_max_f32 v5, s[sgpractivationAlpha], v5 // max(alpha, min(x, beta)) +v_min_f32 v6, s[sgpractivationBeta], v6 // min(x, beta) +v_max_f32 v6, s[sgpractivationAlpha], v6 // max(alpha, min(x, beta)) +v_min_f32 v7, s[sgpractivationBeta], v7 // min(x, beta) +v_max_f32 v7, s[sgpractivationAlpha], v7 // max(alpha, min(x, beta)) +v_min_f32 v8, s[sgpractivationBeta], v8 // min(x, beta) +v_max_f32 v8, s[sgpractivationAlpha], v8 // max(alpha, min(x, beta)) +v_min_f32 v9, s[sgpractivationBeta], v9 // min(x, beta) +v_max_f32 v9, s[sgpractivationAlpha], v9 // max(alpha, min(x, beta)) +v_min_f32 v10, s[sgpractivationBeta], v10 // min(x, beta) +v_max_f32 v10, s[sgpractivationAlpha], v10 // max(alpha, min(x, beta)) +v_min_f32 v11, s[sgpractivationBeta], v11 // min(x, beta) +v_max_f32 v11, s[sgpractivationAlpha], v11 // max(alpha, min(x, beta)) +s_setpc_b64 s[66:67] label_Activation_None_VW1: -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] label_Activation_Gelu_VW1: v_mul_f32 v12, 0x3d372713, v4 // k1 * x v_fma_f32 v12, v4, v12, 1.0 // 1 + (k1 * x * x) @@ -23316,10 +23312,10 @@ s_nop 0 // 1 wait states v_fma_f32 v12, -2.0, v12, 2.0 // ( + 1 (fused)) v_mul_f32 v12, v4, v12 // x * (1 + tanh(...)) v_mul_f32 v4, 0.5, v12 // 0.5 * x * (1 + tanh(...)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] label_Activation_Relu_VW1: v_max_f32 v4, v4, 0 // x = max(0, x) -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] label_Activation_Silu_VW1: v_mul_f32 v12, -1.4426950408889634, v4 // (fused -1.442695) v_exp_f32 v12, v12 // exp step 2 @@ -23328,7 +23324,11 @@ v_add_f32 v12, 1.0, v12 // 1 + exp(-x) v_rcp_f32 v12, v12 // 1 / (1 + exp(-x)) s_nop 0 // 1 wait states v_mul_f32 v4, v4, v12 // x / (1 + exp(-x)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] +label_Activation_Clamp_VW1: +v_min_f32 v4, s[sgpractivationBeta], v4 // min(x, beta) +v_max_f32 v4, s[sgpractivationAlpha], v4 // max(alpha, min(x, beta)) +s_setpc_b64 s[66:67] label_SK_Partials: label_GW_Partials_E0: s_mov_b64 s[sgprSrdWS+0:sgprSrdWS+0+1], s[sgprAddressWS+0:sgprAddressWS+0+1] // init SRD base address @@ -23726,24 +23726,24 @@ s_nop 0 // 1 wait state required when s_waitcnt vmcnt(0) // wait for data store s_barrier // store all data before setting flag s_lshl_b32 s8, s[sgprStreamKIdx], 2 // flag offset based on CTA index -v_readfirstlane_b32 s72, v[vgprSerial] // Wave 0 updates flags -s_cmp_eq_u32 s72, 0 // Check for wave 0 +v_readfirstlane_b32 s65, v[vgprSerial] // Wave 0 updates flags +s_cmp_eq_u32 s65, 0 // Check for wave 0 s_cbranch_scc0 label_SK_SkipFlagSet // Skip flag set -s_mov_b32 s72, 1 // flag data -s_store_dword s72, s[sgprAddressFlags:sgprAddressFlags+1], s8 glc // set flag +s_mov_b32 s65, 1 // flag data +s_store_dword s65, s[sgprAddressFlags:sgprAddressFlags+1], s8 glc // set flag label_SK_SkipFlagSet: s_waitcnt lgkmcnt(0) // wait for flag s_branch label_GW_End // jump to end label_GW_End: -s_mov_b32 s[sgprAlpha], s67 // Restore alpha value +s_mov_b32 s[sgprAlpha], s64 // Restore alpha value s_cmp_ge_u32 s[sgprStreamKIter], s[sgprStreamKIterEnd] // Check if done all StreamK iterations s_cbranch_scc1 label_NoBranch_Y57Y54XUE2DV604X // Only branch on scc0 -s_getpc_b64 s[72:73] // addr of next instr -s_add_i32 s74, label_PersistentLoopStart, 4 // target branch offset -s_abs_i32 s74, s74 // abs offset -s_sub_u32 s72, s72, s74 // sub target branch offset -s_subb_u32 s73, s73, 0 // sub high and carry -s_setpc_b64 s[72:73] // branch to label_PersistentLoopStart +s_getpc_b64 s[64:65] // addr of next instr +s_add_i32 s66, label_PersistentLoopStart, 4 // target branch offset +s_abs_i32 s66, s66 // abs offset +s_sub_u32 s64, s64, s66 // sub target branch offset +s_subb_u32 s65, s65, 0 // sub high and carry +s_setpc_b64 s[64:65] // branch to label_PersistentLoopStart label_NoBranch_Y57Y54XUE2DV604X: label_KernelEnd: s_endpgm // Kernel End diff --git a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_F8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950.s b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_F8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950.s index c6046df1251e..d133cd9e9d7d 100644 --- a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_F8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950.s +++ b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_F8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950.s @@ -30,7 +30,7 @@ .text /* Num VGPR =249 */ /* Num AccVGPR=256 */ -/* Num SGPR =111 */ +/* Num SGPR =105 */ /******************************************/ /* Optimizations and Config: */ @@ -230,118 +230,83 @@ amdhsa.kernels: .offset: 116 .value_kind: by_value .value_type: f32 - - .name: MagicNumberProblemNumGroupTiles0 + - .name: ItersPerTile .size: 4 .offset: 120 .value_kind: by_value .value_type: u32 - - .name: MagicShiftProblemNumGroupTiles0 + - .name: TotalIters .size: 4 .offset: 124 .value_kind: by_value .value_type: u32 - - .name: ItersPerTile + - .name: SKItersPerWG .size: 4 .offset: 128 .value_kind: by_value .value_type: u32 - - .name: MagicNumberItersPerTile + - .name: skGridAndTiles .size: 4 .offset: 132 .value_kind: by_value .value_type: u32 - - .name: MagicShiftItersPerTile - .size: 4 - .offset: 136 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumProblemNumGroupTiles0By1 - .size: 4 - .offset: 140 - .value_kind: by_value - .value_type: u32 - - .name: MagicShiftProblemNumGroupTiles0By1 - .size: 4 - .offset: 144 - .value_kind: by_value - .value_type: u32 - - .name: TotalIters - .size: 4 - .offset: 148 - .value_kind: by_value - .value_type: u32 - - .name: SKItersPerWG - .size: 4 - .offset: 152 - .value_kind: by_value - .value_type: u32 - - .name: skGrid - .size: 4 - .offset: 156 - .value_kind: by_value - .value_type: u32 - - .name: skTiles - .size: 4 - .offset: 160 - .value_kind: by_value - .value_type: u32 - .name: skExtraIters .size: 4 - .offset: 164 + .offset: 136 .value_kind: by_value .value_type: u32 - .name: AddressScaleA .size: 8 - .offset: 168 + .offset: 140 .value_kind: global_buffer .value_type: f32 .address_space: generic - .name: AddressScaleB .size: 8 - .offset: 176 + .offset: 148 .value_kind: global_buffer .value_type: f32 .address_space: generic - .name: AddressScaleAlphaVec .size: 8 - .offset: 184 + .offset: 156 .value_kind: global_buffer .value_type: f32 .address_space: generic - .name: bias .size: 8 - .offset: 192 + .offset: 164 .value_kind: global_buffer .value_type: void .address_space: generic - .name: biasType .size: 4 - .offset: 200 + .offset: 172 .value_kind: by_value .value_type: u32 - .name: StrideBias .size: 4 - .offset: 204 + .offset: 176 .value_kind: by_value .value_type: u32 - .name: activationAlpha .size: 4 - .offset: 208 + .offset: 180 .value_kind: by_value .value_type: f32 - .name: activationBeta .size: 4 - .offset: 212 + .offset: 184 .value_kind: by_value .value_type: f32 - .name: activationType .size: 4 - .offset: 216 + .offset: 188 .value_kind: by_value .value_type: u32 .group_segment_fixed_size: 135168 .kernarg_segment_align: 8 - .kernarg_segment_size: 224 + .kernarg_segment_size: 192 .max_flat_workgroup_size: 256 .private_segment_fixed_size: 0 .sgpr_count: 102 @@ -352,7 +317,6 @@ amdhsa.kernels: ... .end_amdgpu_metadata Custom_Cijk_Alik_Bljk_F8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950: -label_ASM_Start: /// Main body of the asm kernel .macro V_MAGIC_DIV vgprDstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req v_mul_hi_u32 v[\vgprDstIdx+1], \dividend, \magicNumber v_mul_lo_u32 v[\vgprDstIdx+0], \dividend, \magicA @@ -413,28 +377,21 @@ label_ASM_Start: /// Main body of the asm kernel .set sgprStridesB, 42 .set sgprAlpha, 44 .set sgprBeta, 45 -.set sgprMagicNumberProblemNumGroupTiles0, 46 -.set sgprMagicShiftProblemNumGroupTiles0, 47 -.set sgprItersPerTile, 48 -.set sgprMagicNumberItersPerTile, 49 -.set sgprMagicShiftItersPerTile, 50 -.set sgprMagicNumProblemNumGroupTiles0By1, 51 -.set sgprMagicShiftProblemNumGroupTiles0By1, 52 -.set sgprTotalIters, 53 -.set sgprSKItersPerWG, 54 -.set sgprskGrid, 55 -.set sgprskTiles, 56 -.set sgprskExtraIters, 57 -.set sgprLocalWriteAddrA, 58 -.set sgprLocalWriteAddrB, 59 -.set sgprSwapA, 60 -.set sgprSwapB, 61 -.set sgprStreamKIdx, 62 -.set sgprStreamKIter, 63 -.set sgprStreamKIterEnd, 64 -.set sgprStreamKLocalStart, 65 -.set sgprStreamKLocalEnd, 66 -.set sgprSrdWS, 68 +.set sgprItersPerTile, 46 +.set sgprTotalIters, 47 +.set sgprSKItersPerWG, 48 +.set sgprskGridAndTiles, 49 +.set sgprskExtraIters, 50 +.set sgprLocalWriteAddrA, 51 +.set sgprLocalWriteAddrB, 52 +.set sgprSwapA, 53 +.set sgprSwapB, 54 +.set sgprStreamKIdx, 55 +.set sgprStreamKIter, 56 +.set sgprStreamKIterEnd, 57 +.set sgprStreamKLocalStart, 58 +.set sgprStreamKLocalEnd, 59 +.set sgprSrdWS, 60 /* Size Assignments */ .set sgprSizeI, sgprSizesFree+0 @@ -515,29 +472,30 @@ label_ASM_Start: /// Main body of the asm kernel /******************************************/ /* Load num of Gemms */ -s_load_dword s67, s[sgprKernArgAddress:sgprKernArgAddress+1], 0 +s_load_dword s64, s[sgprKernArgAddress:sgprKernArgAddress+1], 0 /* Load packed kernel args (StaggerU/GSU) */ -s_load_dword s73, s[sgprKernArgAddress:sgprKernArgAddress+1], 4 +s_load_dword s66, s[sgprKernArgAddress:sgprKernArgAddress+1], 4 /* Load WGM data */ s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 8 /* Load num of WGs */ -s_load_dword s74, s[sgprKernArgAddress:sgprKernArgAddress+1], 12 +s_load_dword s67, s[sgprKernArgAddress:sgprKernArgAddress+1], 12 s_waitcnt lgkmcnt(0) // load args -s_lshr_b32 s72, s67, 0x1e // Get arg type -s_and_b32 s67, 0x3fffffff, s67 // Get nums of gemm -s_cmp_eq_u32 s72, 0 // Is kernel args +s_lshr_b32 s65, s64, 0x1e // Get arg type +s_and_b32 s64, 0x3fffffff, s64 // Get nums of gemm +s_cmp_eq_u32 s65, 0 // Is kernel args s_cbranch_scc0 label_HBMArgs s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Load Kernel Args */ s_load_dwordx16 s[20:35], s[sgprKernArgAddress:sgprKernArgAddress+1], 0 // 0 -s_load_dwordx16 s[36:51], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 -s_load_dwordx4 s[52:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 128 // 128 -s_load_dwordx2 s[56:57], s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx8 s[36:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 +s_load_dwordx4 s[44:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 +s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120 s_waitcnt lgkmcnt(0) // preload s_branch label_LoadArgsEnd label_HBMArgs: @@ -548,9 +506,7 @@ s_waitcnt lgkmcnt(0) // wait for args to load label_LoadArgsEnd: s_branch label_common_kernel_entry -/* pad 35 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */ -s_nop 0 -s_nop 0 +/* pad 33 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */ s_nop 0 s_nop 0 s_nop 0 @@ -585,10 +541,10 @@ s_nop 0 s_nop 0 s_nop 0 label_Preload_Offset_Start: -s_and_b32 s67, 0x3fffffff, s2 // Get nums of gemm -s_lshr_b32 s72, s2, 0x1e // Get arg type -s_mov_b32 s73, s3 // Preload internal args -s_cmp_eq_u32 s72, 0 // Is kernel args +s_and_b32 s64, 0x3fffffff, s2 // Get nums of gemm +s_lshr_b32 s65, s2, 0x1e // Get arg type +s_mov_b32 s66, s3 // Preload internal args +s_cmp_eq_u32 s65, 0 // Is kernel args s_cbranch_scc0 label_Preload_HBMArgs s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 @@ -596,9 +552,9 @@ s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Load Kernel Args */ s_load_dword s27, s[sgprKernArgAddress:sgprKernArgAddress+1], 28 // 28 s_load_dwordx16 s[28:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 32 // 32 -s_load_dwordx8 s[44:51], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96 -s_load_dwordx4 s[52:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 128 // 128 -s_load_dwordx2 s[56:57], s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx4 s[44:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 +s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120 s_mov_b64 s[20:21], s[6:7] // move preload data to correct sgpr s_mov_b64 s[22:23], s[8:9] // move preload data to correct sgpr s_mov_b64 s[24:25], s[10:11] // move preload data to correct sgpr @@ -608,90 +564,90 @@ label_Preload_HBMArgs: s_mov_b64 s[sgprKernArgAddress:sgprKernArgAddress+1], s[6:7] // Load address of kernel arguments label_Preload_LoadArgsEnd: s_mov_b32 s[sgprWGM], s4 // Preload internal args2 -s_mov_b32 s74, s5 // Load num of WGs +s_mov_b32 s67, s5 // Load num of WGs label_common_kernel_entry: /// for both preload/non-preload common code s_mov_b32 s[sgprWorkGroup0+0], s13 // restore workgroup id s_mov_b32 s[sgprWorkGroup0+1], s14 // restore workgroup id s_mov_b32 s[sgprWorkGroup0+2], s15 // restore workgroup id -s_and_b32 s[sgprStaggerU], s73, 0xffff0000 // Restore StaggerU related vars +s_and_b32 s[sgprStaggerU], s66, 0xffff0000 // Restore StaggerU related vars s_lshr_b32 s[sgprStaggerU], s[sgprStaggerU], 0x10 -s_mov_b32 s[sgprArgType], s72 +s_mov_b32 s[sgprArgType], s65 s_mov_b32 m0, 0x21000 // LDS clamp at 135168 bytes v_mov_b32 v[vgprSerial], v0 // thread serial id /* remap workgroup to XCCs */ -s_lshr_b32 s80, s[sgprWGM], 0x10 // Get WGMXCC -s_ff1_i32_b32 s80, s80 // Get log(WGMXCC) -s_lshr_b32 s81, s[sgprWGM], 0x16 // Get CU_Count +s_lshr_b32 s72, s[sgprWGM], 0x10 // Get WGMXCC +s_ff1_i32_b32 s72, s72 // Get log(WGMXCC) +s_lshr_b32 s73, s[sgprWGM], 0x16 // Get CU_Count /* remap WGs if WGMXCC > 1 ( log(WGMXCC) > 0 ) */ -s_cmp_gt_i32 s80, 0 +s_cmp_gt_i32 s72, 0 s_cbranch_scc0 label_skip_WGMXCC /* only remap WGs in the range */ -s_lshr_b32 s77, s74, s80 -s_lshl_b32 s77, s77, s80 -s_cmp_ge_u32 s[sgprWorkGroup0], s77 +s_lshr_b32 s69, s67, s72 +s_lshl_b32 s69, s69, s72 +s_cmp_ge_u32 s[sgprWorkGroup0], s69 s_cbranch_scc1 label_skip_WGMXCC -s_cmp_eq_u32 s81, 0 // CU_Count == 0 ? +s_cmp_eq_u32 s73, 0 // CU_Count == 0 ? s_cbranch_scc0 label_XCCG_nonzero -s_lshr_b32 s77, s[sgprWorkGroup0], s80 -s_bfm_b32 s78, s80, 0 -s_and_b32 s78, s[sgprWorkGroup0], s78 -s_lshr_b32 s79, s74, s80 -s_mul_i32 s78, s78, s79 -s_add_u32 s[sgprWorkGroup0], s77, s78 +s_lshr_b32 s69, s[sgprWorkGroup0], s72 +s_bfm_b32 s70, s72, 0 +s_and_b32 s70, s[sgprWorkGroup0], s70 +s_lshr_b32 s71, s67, s72 +s_mul_i32 s70, s70, s71 +s_add_u32 s[sgprWorkGroup0], s69, s70 s_branch label_skip_WGMXCC label_XCCG_nonzero: /* temp0 = (wg//CU_Count)*CU_Count */ -v_cvt_f32_u32 v4, s81 // wg//CU_Count +v_cvt_f32_u32 v4, s73 // wg//CU_Count v_rcp_iflag_f32 v4, v4 // wg//CU_Count v_cvt_f32_u32 v5, s[sgprWorkGroup0] // wg//CU_Count v_mul_f32 v4, v4, v5 // wg//CU_Count v_cvt_u32_f32 v4, v4 // wg//CU_Count -v_mul_u32_u24 v5, v4, s81 // wg//CU_Count +v_mul_u32_u24 v5, v4, s73 // wg//CU_Count v_sub_u32 v5, s[sgprWorkGroup0], v5 // wg//CU_Count -v_cmpx_eq_u32 exec, v5, s81 // wg//CU_Count +v_cmpx_eq_u32 exec, v5, s73 // wg//CU_Count v_add_u32 v4, 1, v4 // wg//CU_Count v_mov_b32 v5, 0 // wg//CU_Count s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s81 // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s73 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 -v_mul_u32_u24 v5, v4, s81 // re-calculate remainder +v_mul_u32_u24 v5, v4, s73 // re-calculate remainder v_sub_u32 v5, s[sgprWorkGroup0], v5 // re-calculate remainder s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s77, v4 // quotient -v_readfirstlane_b32 s78, v5 // remainder -s_mul_i32 s77, s77, s81 +v_readfirstlane_b32 s69, v4 // quotient +v_readfirstlane_b32 s70, v5 // remainder +s_mul_i32 s69, s69, s73 /* temp1 = (wg%CU_Count)//WGMXCC */ -s_lshr_b32 s78, s78, s80 +s_lshr_b32 s70, s70, s72 /* temp0 = temp0 + temp1 */ -s_add_u32 s77, s77, s78 +s_add_u32 s69, s69, s70 /* temp1 = (wg%WGMXCC) * ((WGs - (WGs//CU_Count) * CU_Count) if (wg > (WGs//CU_Count) * CU_Count) else CU_Count)//WGMXCC */ -v_cvt_f32_u32 v4, s81 // WGs//CU_Count +v_cvt_f32_u32 v4, s73 // WGs//CU_Count v_rcp_iflag_f32 v4, v4 // WGs//CU_Count -v_cvt_f32_u32 v5, s74 // WGs//CU_Count +v_cvt_f32_u32 v5, s67 // WGs//CU_Count v_mul_f32 v4, v4, v5 // WGs//CU_Count v_cvt_u32_f32 v4, v4 // WGs//CU_Count -v_mul_u32_u24 v5, v4, s81 // WGs//CU_Count -v_sub_u32 v5, s74, v5 // WGs//CU_Count -v_cmpx_eq_u32 exec, v5, s81 // WGs//CU_Count +v_mul_u32_u24 v5, v4, s73 // WGs//CU_Count +v_sub_u32 v5, s67, v5 // WGs//CU_Count +v_cmpx_eq_u32 exec, v5, s73 // WGs//CU_Count v_add_u32 v4, 1, v4 // WGs//CU_Count s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s81 // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s73 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s78, v4 // quotient -s_mul_i32 s78, s78, s81 -s_sub_u32 s79, s74, s78 -s_cmp_gt_u32 s[sgprWorkGroup0], s78 -s_cselect_b32 s78, s79, s81 -s_lshr_b32 s78, s78, s80 -s_bfm_b32 s79, s80, 0 -s_and_b32 s79, s[sgprWorkGroup0], s79 -s_mul_i32 s78, s78, s79 +v_readfirstlane_b32 s70, v4 // quotient +s_mul_i32 s70, s70, s73 +s_sub_u32 s71, s67, s70 +s_cmp_gt_u32 s[sgprWorkGroup0], s70 +s_cselect_b32 s70, s71, s73 +s_lshr_b32 s70, s70, s72 +s_bfm_b32 s71, s72, 0 +s_and_b32 s71, s[sgprWorkGroup0], s71 +s_mul_i32 s70, s70, s71 /* WorkGroup0 = temp0 + temp1 */ -s_add_u32 s[sgprWorkGroup0], s77, s78 +s_add_u32 s[sgprWorkGroup0], s69, s70 label_skip_WGMXCC: /// skip WGMXCC if no enough WGs to remap -s_cmp_eq_u32 s72, 0 +s_cmp_eq_u32 s65, 0 s_cbranch_scc0 label_MultiGemm /* init: add vgpr [4...136) to pool */ /* init: add vgpr [0...0) to pool */ @@ -721,97 +677,98 @@ v_cmp_ne_u32 vcc, v7, 0 // v4 = ceil(v5 / v6) v_addc_co_u32 v4, vcc, v4, 0, vcc // ceil s_nop 0 // 1 wait states v_readfirstlane_b32 s[sgprNumWorkGroups1], v4 // set back to numWorkGroup1 -s_waitcnt lgkmcnt(0) // wait for 108/0 bytes of kern args +s_waitcnt lgkmcnt(0) // wait for 80/0 bytes of kern args s_branch label_MultiGemmEnd label_MultiGemm: /* Check if custom structure pointer is null */ s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ? s_cbranch_scc1 label_IsExternalValid // branch if ArgType == 2 -s_mov_b32 s11, 204 -s_mul_i32 s78, s67, 4 -s_mov_b64 s[72:73], s[sgprKernArgAddress:sgprKernArgAddress+1] +s_mov_b32 s11, 176 +s_mul_i32 s72, s64, 4 +s_mov_b64 s[66:67], s[sgprKernArgAddress:sgprKernArgAddress+1] s_branch label_IsExternalValidEnd label_IsExternalValid: -s_mov_b32 s11, 244 -s_mov_b32 s78, 0 -s_mov_b64 s[72:73], s[sgprKernArgAddress:sgprKernArgAddress+1] +s_mov_b32 s11, 216 +s_mov_b32 s72, 0 +s_mov_b64 s[66:67], s[sgprKernArgAddress:sgprKernArgAddress+1] label_IsExternalValidEnd: /* Grouped Gemm:: prefetch 1 arg load */ s_mov_b32 s10, 1 -s_mov_b32 s79, 0 -s_load_dwordx4 s[20:23], s[72:73], s78 -s_cmpk_eq_u32 s67, 1 // if gemm_count is 1? +s_mov_b32 s73, 0 +s_load_dwordx4 s[20:23], s[66:67], s72 +s_cmpk_eq_u32 s64, 1 // if gemm_count is 1? s_cbranch_scc1 label_wgTable_noLoadLoop /* Grouped Gemm:: accumulate numTiles for each gemm */ /* Grouped Gemm:: loop start */ label_Loop_GemmCount: s_waitcnt lgkmcnt(0) -s_lshr_b32 s76, s20, 8 // s76 = s20 / 256 -s_and_b32 s74, 255, s20 // s74 = s20 % 256 -s_addc_u32 s76, s76, 0 -s_lshr_b32 s77, s21, 8 // s77 = s21 / 256 -s_and_b32 s74, 255, s21 // s74 = s21 % 256 -s_addc_u32 s77, s77, 0 -s_mul_i32 s76, s76, s77 -s_mul_i32 s76, s76, s22 -s_add_u32 s79, s79, s76 -s_cmp_lt_u32 s[sgprWorkGroup0], s79 +s_lshr_b32 s70, s20, 8 // s70 = s20 / 256 +s_and_b32 s68, 255, s20 // s68 = s20 % 256 +s_addc_u32 s70, s70, 0 +s_lshr_b32 s71, s21, 8 // s71 = s21 / 256 +s_and_b32 s68, 255, s21 // s68 = s21 % 256 +s_addc_u32 s71, s71, 0 +s_mul_i32 s70, s70, s71 +s_mul_i32 s70, s70, s22 +s_add_u32 s73, s73, s70 +s_cmp_lt_u32 s[sgprWorkGroup0], s73 s_cbranch_scc1 label_FOUND -s_add_u32 s78, s78, s11 -s_load_dwordx4 s[20:23], s[72:73], s78 +s_add_u32 s72, s72, s11 +s_load_dwordx4 s[20:23], s[66:67], s72 s_add_u32 s10, s10, 1 -s_cmp_lt_u32 s10, s67 +s_cmp_lt_u32 s10, s64 s_cbranch_scc1 label_Loop_GemmCount /* Grouped Gemm:: noLoadLoop */ label_wgTable_noLoadLoop: s_waitcnt lgkmcnt(0) -s_lshr_b32 s76, s20, 8 // s76 = s20 / 256 -s_and_b32 s74, 255, s20 // s74 = s20 % 256 -s_addc_u32 s76, s76, 0 -s_lshr_b32 s77, s21, 8 // s77 = s21 / 256 -s_and_b32 s74, 255, s21 // s74 = s21 % 256 -s_addc_u32 s77, s77, 0 -s_mul_i32 s76, s76, s77 -s_mul_i32 s76, s76, s22 -s_add_u32 s79, s79, s76 +s_lshr_b32 s70, s20, 8 // s70 = s20 / 256 +s_and_b32 s68, 255, s20 // s68 = s20 % 256 +s_addc_u32 s70, s70, 0 +s_lshr_b32 s71, s21, 8 // s71 = s21 / 256 +s_and_b32 s68, 255, s21 // s68 = s21 % 256 +s_addc_u32 s71, s71, 0 +s_mul_i32 s70, s70, s71 +s_mul_i32 s70, s70, s22 +s_add_u32 s73, s73, s70 /* Grouped Gemm:: gemmIndex found */ label_FOUND: -s_sub_u32 s73, s10, 1 -s_sub_u32 s72, s79, s76 -s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s72 +s_sub_u32 s67, s10, 1 +s_sub_u32 s66, s73, s70 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s66 /* Check if custom structure pointer is null */ s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ? s_cbranch_scc1 label_LoadExternalStruct // branch if ArgType == 2 /* Grouped Gemm: offset argument address to gemm */ /* Grouped Gemm: offset address from wg_table_start to args_start */ -s_lshl2_add_u32 s[sgprKernArgAddress], s67, s[sgprKernArgAddress] +s_lshl2_add_u32 s[sgprKernArgAddress], s64, s[sgprKernArgAddress] s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Grouped Gemm: offset address from args_start to gemm_start */ -s_mul_i32 s73, s73, 204 -s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s73 +s_mul_i32 s67, s67, 176 +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s67 s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Load Kernel Args */ s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16 -s_load_dwordx16 s[40:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 -s_load_dwordx2 s[56:57], s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx8 s[40:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 +s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120 s_branch label_LoadExternalStructEnd label_LoadExternalStruct: /* Grouped Gemm: offset address from args_start to gemm_start */ -s_mul_i32 s73, s73, 244 -s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s73 +s_mul_i32 s67, s67, 216 +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s67 s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16 -s_load_dwordx16 s[40:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 -s_load_dword s56, s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx8 s[40:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 // Read Beta -s_load_dword s45, s[sgprKernArgAddress:sgprKernArgAddress+1], 160 // 160 +s_load_dword s45, s[sgprKernArgAddress:sgprKernArgAddress+1], 132 // 132 label_LoadExternalStructEnd: /* init: add vgpr [4...136) to pool */ /* init: add vgpr [0...0) to pool */ @@ -841,7 +798,7 @@ v_cmp_ne_u32 vcc, v7, 0 // v4 = ceil(v5 / v6) v_addc_co_u32 v4, vcc, v4, 0, vcc // ceil s_nop 0 // 1 wait states v_readfirstlane_b32 s[sgprNumWorkGroups1], v4 // set back to numWorkGroup1 -s_waitcnt lgkmcnt(0) // wait for 108/0 bytes of kern args +s_waitcnt lgkmcnt(0) // wait for 80/0 bytes of kern args /* Early stop if N(SizeFreeJ) == 0 */ s_cmp_eq_u32 s[sgprSizeJ], 0 @@ -851,25 +808,17 @@ s_endpgm label_NoEarlyStop_N0: label_MultiGemmEnd: -.set sgprSrdA, 72 -.set sgprSrdB, 76 -.set sgprShadowLimitA, 80 -.set sgprShadowLimitB, 82 -.set sgprStaggerUIter, 67 -.set sgprWrapUA, sgprKernArgAddress -.set sgprWrapUB, 84 -.set sgprGlobalReadIncsA, 86 -.set sgprGlobalReadIncsB, 87 -.set sgprScalarGlobalReadOffsetA, 88 -.set sgprScalarGlobalReadOffsetB, 95 - -.set sgpr104, 88 -.set sgpr105, 89 -.set sgpr106, 90 -.set sgpr107, 91 -.set sgpr108, 92 -.set sgpr109, 93 -.set sgpr110, 94 +.set sgprSrdA, 64 +.set sgprSrdB, 68 +.set sgprShadowLimitA, 72 +.set sgprShadowLimitB, 74 +.set sgprStaggerUIter, 76 +.set sgprWrapUA, 77 +.set sgprWrapUB, 79 +.set sgprGlobalReadIncsA, 81 +.set sgprGlobalReadIncsB, 82 +.set sgprScalarGlobalReadOffsetA, 83 +.set sgprScalarGlobalReadOffsetB, 90 s_sub_u32 s[sgprAddressA+0], s[sgprAddressA+0], 16 // pre-pad to make room for possible pointer shift s_subb_u32 s[sgprAddressA+1], s[sgprAddressA+1], 0 // pre-pad to make room for possible pointer shift s_sub_u32 s[sgprAddressB+0], s[sgprAddressB+0], 16 // pre-pad to make room for possible pointer shift @@ -883,28 +832,30 @@ label_AlphaNonZero: s_mov_b32 s[sgprStreamKIdx], s[sgprWorkGroup0] // Save original StreamK index s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprItersPerTile] // DP starting iteration (case: DP work to do) s_mov_b32 s[sgprStreamKIterEnd], s[sgprTotalIters] // DP ending iteration (case: only DP work to do) -s_mul_i32 s[sgpr104], s[sgprskTiles], s[sgprItersPerTile] // Total SK iters -s_cmp_lt_u32 s[sgpr104], s[sgprTotalIters] // Check if there are DP tiles to do +s_and_b32 s97, s[sgprskGridAndTiles], 0xffff // Get skTiles +s_mul_i32 s97, s97, s[sgprItersPerTile] // Total SK iters +s_cmp_lt_u32 s97, s[sgprTotalIters] // Check if there are DP tiles to do s_cbranch_scc1 label_SK_InitDone // Done init s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprSKItersPerWG] // StreamK starting iteration (case: after extra iters) s_add_u32 s[sgprStreamKIter], s[sgprStreamKIter], s[sgprskExtraIters] // Add extra iters s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIter], s[sgprSKItersPerWG] // StreamK ending iteration (case: after extra iters) -s_add_u32 s[sgpr105], s[sgprSKItersPerWG], 1 // Spread out extra iterations -s_mul_i32 s[sgpr104], s[sgprStreamKIdx], s[sgpr105] // StreamK starting iteration (case: before extra iters) -s_add_u32 s[sgpr105], s[sgpr104], s[sgpr105] // StreamK ending iteration (case: before extra iters) +s_add_u32 s98, s[sgprSKItersPerWG], 1 // Spread out extra iterations +s_mul_i32 s97, s[sgprStreamKIdx], s98 // StreamK starting iteration (case: before extra iters) +s_add_u32 s98, s97, s98 // StreamK ending iteration (case: before extra iters) s_cmp_lt_u32 s[sgprStreamKIdx], s[sgprskExtraIters] // Check if lane gets an extra iteration -s_cselect_b32 s[sgprStreamKIter], s[sgpr104], s[sgprStreamKIter] // Set start iter -s_cselect_b32 s[sgprStreamKIterEnd], s[sgpr105], s[sgprStreamKIterEnd] // Set end iter -s_mul_i32 s[sgpr104], s[sgprskTiles], s[sgprItersPerTile] // Total SK iters -s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgpr104] // Cap ending iter at total SK iters +s_cselect_b32 s[sgprStreamKIter], s97, s[sgprStreamKIter] // Set start iter +s_cselect_b32 s[sgprStreamKIterEnd], s98, s[sgprStreamKIterEnd] // Set end iter +s_and_b32 s97, s[sgprskGridAndTiles], 0xffff // Get skTiles +s_mul_i32 s97, s97, s[sgprItersPerTile] // Total SK iters +s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s97 // Cap ending iter at total SK iters label_SK_InitDone: s_cmp_lt_u32 s[sgprStreamKIter], s[sgprTotalIters] // Make sure there's work to do s_cbranch_scc1 label_NoBranch_T8JHFHKM7BO5OHXW // Only branch on scc0 -s_getpc_b64 s[sgpr104:sgpr105] // addr of next instr -s_add_i32 s[sgpr106], label_KernelEnd, 4 // target branch offset -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr106] // add target branch offset -s_addc_u32 s[sgpr105], s[sgpr105], 0 // add high and carry -s_setpc_b64 s[sgpr104:sgpr105] // branch to label_KernelEnd +s_getpc_b64 s[98:99] // addr of next instr +s_add_i32 s100, label_KernelEnd, 4 // target branch offset +s_add_u32 s98, s98, s100 // add target branch offset +s_addc_u32 s99, s99, 0 // add high and carry +s_setpc_b64 s[98:99] // branch to label_KernelEnd label_NoBranch_T8JHFHKM7BO5OHXW: /******************************************/ @@ -913,14 +864,10 @@ label_NoBranch_T8JHFHKM7BO5OHXW: label_PersistentLoopStart: // Use sgprScalarGlobalReadOffsetA/B sgprs -.set sgpr104, 88 -.set sgpr105, 89 -.set sgpr106, 90 -.set sgpr107, 91 -.set sgpr108, 92 -.set sgpr109, 93 -.set sgpr110, 94 - +.set sgpr102, 84 +.set sgpr103, 85 +.set sgpr104, 86 + /******************************************/ /* Begin setupNewTile */ /******************************************/ @@ -938,78 +885,106 @@ v_min_i32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], v4 // Set LRA to first b v_xor_b32 v4, v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // Get other lds buffer offset value v_min_i32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], v4 // Set LRA to first buffer offset /* StreamK calculate tile idx and map to WG */ -s_mul_hi_u32 s[sgpr105], s[sgprStreamKIter], s[sgprMagicNumberItersPerTile] // s_magic mul, div alg 2 -s_lshr_b32 s[sgpr106], s[sgprMagicShiftItersPerTile], 31 // tmpS = extract abit -s_mul_i32 s[sgpr104], s[sgprStreamKIter], s[sgpr106] // s_magic mul, div alg 2 -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr105] -s_and_b32 s[sgpr106], s[sgprMagicShiftItersPerTile], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s[sgpr104], s[sgpr104], s[sgpr106] // sMagicDiv Alg 2 -s_mul_i32 s[sgpr105], s[sgpr104], s[sgprItersPerTile] // Tile start iteration -s_add_u32 s[sgpr106], s[sgpr105], s[sgprItersPerTile] // Tile end iteration -s_sub_u32 s[sgprStreamKLocalStart], s[sgprStreamKIter], s[sgpr105] // Local iteration start -s_min_u32 s[sgprStreamKLocalEnd], s[sgprStreamKIterEnd], s[sgpr106] // 1. (Local) iteration end (SK tile) -s_sub_u32 s[sgprStreamKLocalEnd], s[sgprStreamKLocalEnd], s[sgpr105] // 2. Local iteration end (SK tile) -s_mul_i32 s[sgpr107], s[sgprskTiles], s[sgprItersPerTile] // Total SK iters -s_sub_u32 s[sgpr107], s[sgprTotalIters], s[sgpr107] // Offset to first SK tile -s_mul_i32 s[sgpr105], s[sgprskGrid], s[sgprItersPerTile] // DP iterations shift -s_add_u32 s[sgpr105], s[sgpr105], s[sgprStreamKIter] // Add DP shift -s_cmp_lt_u32 s[sgpr105], s[sgpr107] // Check if still in DP section +v_cvt_f32_u32 v4, s[sgprItersPerTile] // StreamKIter // ItersPerTile +v_rcp_iflag_f32 v4, v4 // StreamKIter // ItersPerTile +v_cvt_f32_u32 v5, s[sgprStreamKIter] // StreamKIter // ItersPerTile +v_mul_f32 v4, v4, v5 // StreamKIter // ItersPerTile +v_cvt_u32_f32 v4, v4 // StreamKIter // ItersPerTile +v_mul_u32_u24 v5, v4, s[sgprItersPerTile] // StreamKIter // ItersPerTile +v_sub_u32 v5, s[sgprStreamKIter], v5 // StreamKIter // ItersPerTile +v_cmpx_eq_u32 exec, v5, s[sgprItersPerTile] // StreamKIter // ItersPerTile +v_add_u32 v4, 1, v4 // StreamKIter // ItersPerTile +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprItersPerTile] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s98, v4 // quotient +s_mul_i32 s99, s98, s[sgprItersPerTile] // Tile start iteration +s_add_u32 s100, s99, s[sgprItersPerTile] // Tile end iteration +s_sub_u32 s[sgprStreamKLocalStart], s[sgprStreamKIter], s99 // Local iteration start +s_min_u32 s[sgprStreamKLocalEnd], s[sgprStreamKIterEnd], s100 // 1. (Local) iteration end (SK tile) +s_sub_u32 s[sgprStreamKLocalEnd], s[sgprStreamKLocalEnd], s99 // 2. Local iteration end (SK tile) +s_and_b32 s101, s[sgprskGridAndTiles], 0xffff // Get skTiles +s_mul_i32 s101, s101, s[sgprItersPerTile] // Total SK iters +s_sub_u32 s101, s[sgprTotalIters], s101 // Offset to first SK tile +s_lshr_b32 s99, s[sgprskGridAndTiles], 0x10 // Get skGrid +s_mul_i32 s99, s99, s[sgprItersPerTile] // DP iterations shift +s_add_u32 s99, s99, s[sgprStreamKIter] // Add DP shift +s_cmp_lt_u32 s99, s101 // Check if still in DP section s_cbranch_scc1 label_SK_UpdateDone // Done update -s_mov_b32 s[sgpr105], s[sgpr106] // SK iterations shift -s_cmp_le_u32 s[sgpr107], s[sgprStreamKIter] // Check if continuing in SK section +s_mov_b32 s99, s100 // SK iterations shift +s_cmp_le_u32 s101, s[sgprStreamKIter] // Check if continuing in SK section s_cbranch_scc1 label_SK_UpdateDone // Done update s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprSKItersPerWG] // StreamK starting iteration (case: after extra iters) s_add_u32 s[sgprStreamKIter], s[sgprStreamKIter], s[sgprskExtraIters] // Add extra iters s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIter], s[sgprSKItersPerWG] // StreamK ending iteration (case: after extra iters) -s_add_u32 s[sgpr109], s[sgprSKItersPerWG], 1 // Spread out extra iterations -s_mul_i32 s[sgpr108], s[sgprStreamKIdx], s[sgpr109] // StreamK starting iteration (case: before extra iters) -s_add_u32 s[sgpr109], s[sgpr108], s[sgpr109] // StreamK ending iteration (case: before extra iters) +s_add_u32 s[sgpr103], s[sgprSKItersPerWG], 1 // Spread out extra iterations +s_mul_i32 s[sgpr102], s[sgprStreamKIdx], s[sgpr103] // StreamK starting iteration (case: before extra iters) +s_add_u32 s[sgpr103], s[sgpr102], s[sgpr103] // StreamK ending iteration (case: before extra iters) s_cmp_lt_u32 s[sgprStreamKIdx], s[sgprskExtraIters] // Check if lane gets an extra iteration -s_cselect_b32 s[sgprStreamKIter], s[sgpr108], s[sgprStreamKIter] // Set start iter -s_cselect_b32 s[sgprStreamKIterEnd], s[sgpr109], s[sgprStreamKIterEnd] // Set end iter -s_add_u32 s[sgpr105], s[sgprStreamKIter], s[sgpr107] // Offset to start of SK section -s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgpr107] // Offset to start of SK section +s_cselect_b32 s[sgprStreamKIter], s[sgpr102], s[sgprStreamKIter] // Set start iter +s_cselect_b32 s[sgprStreamKIterEnd], s[sgpr103], s[sgprStreamKIterEnd] // Set end iter +s_add_u32 s99, s[sgprStreamKIter], s101 // Offset to start of SK section +s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s101 // Offset to start of SK section s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgprTotalIters] // Cap ending iter at total SK iters s_cmp_lt_u32 s[sgprStreamKIter], s[sgprTotalIters] // Make sure there's work to do s_cbranch_scc1 label_NoBranch_S4FDBQ587JJL6NOU // Only branch on scc0 -s_getpc_b64 s[sgpr108:sgpr109] // addr of next instr -s_add_i32 s[sgpr110], label_KernelEnd, 4 // target branch offset -s_add_u32 s[sgpr108], s[sgpr108], s[sgpr110] // add target branch offset -s_addc_u32 s[sgpr109], s[sgpr109], 0 // add high and carry -s_setpc_b64 s[sgpr108:sgpr109] // branch to label_KernelEnd +s_getpc_b64 s[sgpr102:sgpr103] // addr of next instr +s_add_i32 s[sgpr104], label_KernelEnd, 4 // target branch offset +s_add_u32 s[sgpr102], s[sgpr102], s[sgpr104] // add target branch offset +s_addc_u32 s[sgpr103], s[sgpr103], 0 // add high and carry +s_setpc_b64 s[sgpr102:sgpr103] // branch to label_KernelEnd label_NoBranch_S4FDBQ587JJL6NOU: label_SK_UpdateDone: -s_mov_b32 s[sgprStreamKIter], s[sgpr105] // Store current iteration +s_mov_b32 s[sgprStreamKIter], s99 // Store current iteration /* Map StreamK tile index to wg0/1/2 */ -s_mul_hi_u32 s[sgpr106], s[sgpr104], s[sgprMagicNumProblemNumGroupTiles0By1] // s_magic mul, div alg 2 -s_lshr_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0By1], 31 // tmpS = extract abit -s_mul_i32 s[sgpr105], s[sgpr104], s[sgpr107] // s_magic mul, div alg 2 -s_add_u32 s[sgpr105], s[sgpr105], s[sgpr106] -s_and_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0By1], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s[sgpr105], s[sgpr105], s[sgpr107] // sMagicDiv Alg 2 -s_mov_b32 s[sgprWorkGroup2], s[sgpr105] // wg2 = Tile Idx / problemNumGroupTiles0By1 -s_mul_i32 s[sgpr105], s[sgpr105], s[sgprNumWorkGroups0] // remainder part 1 : quotient * divisor -s_mul_i32 s[sgpr105], s[sgpr105], s[sgprNumWorkGroups1] // remainder part 1 : quotient * divisor -s_sub_u32 s[sgpr104], s[sgpr104], s[sgpr105] // remainder -s_mul_hi_u32 s[sgpr106], s[sgpr104], s[sgprMagicNumberProblemNumGroupTiles0] // s_magic mul, div alg 2 -s_lshr_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0], 31 // tmpS = extract abit -s_mul_i32 s[sgpr105], s[sgpr104], s[sgpr107] // s_magic mul, div alg 2 -s_add_u32 s[sgpr105], s[sgpr105], s[sgpr106] -s_and_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s[sgpr105], s[sgpr105], s[sgpr107] // sMagicDiv Alg 2 -s_mov_b32 s[sgprWorkGroup1], s[sgpr105] // wg1 = Tile Idx / problemNumGroupTiles0 -s_mul_i32 s[sgprWorkGroup0], s[sgpr105], s[sgprNumWorkGroups0] // remainder part 1 : quotient * divisor -s_sub_u32 s[sgprWorkGroup0], s[sgpr104], s[sgprWorkGroup0] // wg0 = Tile Idx % problemNumGroupTiles0 +s_mul_i32 s99, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] // Total tiles +v_cvt_f32_u32 v4, s99 // TileID // nWG0*nWG1 +v_rcp_iflag_f32 v4, v4 // TileID // nWG0*nWG1 +v_cvt_f32_u32 v5, s98 // TileID // nWG0*nWG1 +v_mul_f32 v4, v4, v5 // TileID // nWG0*nWG1 +v_cvt_u32_f32 v4, v4 // TileID // nWG0*nWG1 +v_mul_u32_u24 v5, v4, s99 // TileID // nWG0*nWG1 +v_sub_u32 v5, s98, v5 // TileID // nWG0*nWG1 +v_cmpx_eq_u32 exec, v5, s99 // TileID // nWG0*nWG1 +v_add_u32 v4, 1, v4 // TileID // nWG0*nWG1 +v_mov_b32 v5, 0 // TileID // nWG0*nWG1 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s99 // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s99 // re-calculate remainder +v_sub_u32 v5, s98, v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup2], v4 // quotient +v_readfirstlane_b32 s100, v5 // remainder +v_cvt_f32_u32 v4, s[sgprNumWorkGroups0] // TileID // nWG0 +v_rcp_iflag_f32 v4, v4 // TileID // nWG0 +v_cvt_f32_u32 v5, s100 // TileID // nWG0 +v_mul_f32 v4, v4, v5 // TileID // nWG0 +v_cvt_u32_f32 v4, v4 // TileID // nWG0 +v_mul_u32_u24 v5, v4, s[sgprNumWorkGroups0] // TileID // nWG0 +v_sub_u32 v5, s100, v5 // TileID // nWG0 +v_cmpx_eq_u32 exec, v5, s[sgprNumWorkGroups0] // TileID // nWG0 +v_add_u32 v4, 1, v4 // TileID // nWG0 +v_mov_b32 v5, 0 // TileID // nWG0 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprNumWorkGroups0] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s[sgprNumWorkGroups0] // re-calculate remainder +v_sub_u32 v5, s100, v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup1], v4 // quotient +v_readfirstlane_b32 s[sgprWorkGroup0], v5 // remainder v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0 // s[Alpha] == 0.0f ? s_cbranch_vccz label_SKAlphaCheck // branch if s[Alpha] != 0 s_cmp_eq_u32 s[sgprStreamKLocalStart], 0 // does wg start tile? s_cbranch_scc1 label_NoBranch_UR8VN3A1SJCPC6PO // Only branch on scc0 -s_getpc_b64 s[sgpr108:sgpr109] // addr of next instr -s_add_i32 s[sgpr110], label_GW_End, 4 // target branch offset -s_add_u32 s[sgpr108], s[sgpr108], s[sgpr110] // add target branch offset -s_addc_u32 s[sgpr109], s[sgpr109], 0 // add high and carry -s_setpc_b64 s[sgpr108:sgpr109] // branch to label_GW_End +s_getpc_b64 s[sgpr102:sgpr103] // addr of next instr +s_add_i32 s[sgpr104], label_GW_End, 4 // target branch offset +s_add_u32 s[sgpr102], s[sgpr102], s[sgpr104] // add target branch offset +s_addc_u32 s[sgpr103], s[sgpr103], 0 // add high and carry +s_setpc_b64 s[sgpr102:sgpr103] // branch to label_GW_End label_NoBranch_UR8VN3A1SJCPC6PO: s_mov_b32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Skip iterations label_SKAlphaCheck: @@ -1018,130 +993,130 @@ s_cmp_gt_i32 s[sgprWGM], 1 // WGM > 1 ? s_cbranch_scc1 label_WGMPositive // branch if WGM > 1 s_cmp_ge_i32 s[sgprWGM], 0 // WGM >= 0 ? s_cbranch_scc1 label_WGM // branch if WGM >= 0 -s_abs_i32 s[sgpr108], s[sgprWGM] // abs(WGM) -v_cvt_f32_u32 v4, s[sgpr108] // WGM +s_abs_i32 s101, s[sgprWGM] // abs(WGM) +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprWorkGroup0] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprWorkGroup0], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr106], v4 // quotient -s_mul_i32 s[sgpr107], s[sgpr106], s[sgpr108] // quotient * non-magic divisor -s_sub_u32 s[sgpr107], s[sgprWorkGroup0], s[sgpr107] // WorkGroup0=remainder -s_mul_i32 s[sgpr107], s[sgpr107], s[sgprNumWorkGroups1] // (wg1 % WGM)*NumWorkGroups1 -s_add_u32 s[sgpr107], s[sgpr107], s[sgprWorkGroup1] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups1 -v_cvt_f32_u32 v4, s[sgpr108] // WGM +v_readfirstlane_b32 s97, v4 // quotient +s_mul_i32 s100, s97, s101 // quotient * non-magic divisor +s_sub_u32 s100, s[sgprWorkGroup0], s100 // WorkGroup0=remainder +s_mul_i32 s100, s100, s[sgprNumWorkGroups1] // (wg1 % WGM)*NumWorkGroups1 +s_add_u32 s100, s100, s[sgprWorkGroup1] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups1 +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprNumWorkGroups0] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprNumWorkGroups0], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr104], v4 // quotient -s_mul_i32 s[sgpr105], s[sgpr108], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgpr105], s[sgprNumWorkGroups0], s[sgpr105] // NumWorkGroups0=remainder -s_cmp_eq_u32 s[sgpr105], 0 // remainder == 0 ? -s_cmov_b32 s[sgpr105], s[sgpr108] // remainder = WGM if remainder == 0 -s_cmp_ge_u32 s[sgpr106], s[sgpr104] // blockId >= numFullBlocks ? -s_cselect_b32 s[sgpr104], s[sgpr105], s[sgpr108] -v_cvt_f32_u32 v4, s[sgpr104] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_cvt_f32_u32 v5, s[sgpr107] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_mul_f32 v4, v4, v5 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_mul_u32_u24 v5, v4, s[sgpr104] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_sub_u32 v5, s[sgpr107], v5 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_cmpx_eq_u32 exec, v5, s[sgpr104] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_add_u32 v4, 1, v4 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_mov_b32 v5, 0 // s[sgprWorkGroup0] = s[sgpr107] % s[sgpr104] +v_readfirstlane_b32 s98, v4 // quotient +s_mul_i32 s99, s101, s98 // quotient * non-magic divisor +s_sub_u32 s99, s[sgprNumWorkGroups0], s99 // NumWorkGroups0=remainder +s_cmp_eq_u32 s99, 0 // remainder == 0 ? +s_cmov_b32 s99, s101 // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s97, s98 // blockId >= numFullBlocks ? +s_cselect_b32 s98, s99, s101 +v_cvt_f32_u32 v4, s98 // s[sgprWorkGroup1] = s100 / s98 +v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup1] = s100 / s98 +v_cvt_f32_u32 v5, s100 // s[sgprWorkGroup1] = s100 / s98 +v_mul_f32 v4, v4, v5 // s[sgprWorkGroup1] = s100 / s98 +v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup1] = s100 / s98 +v_mul_u32_u24 v5, v4, s98 // s[sgprWorkGroup1] = s100 / s98 +v_sub_u32 v5, s100, v5 // s[sgprWorkGroup1] = s100 / s98 +v_cmpx_eq_u32 exec, v5, s98 // s[sgprWorkGroup1] = s100 / s98 +v_add_u32 v4, 1, v4 // s[sgprWorkGroup1] = s100 / s98 +v_mov_b32 v5, 0 // s[sgprWorkGroup0] = s100 % s98 s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr104] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s98 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 -v_mul_u32_u24 v5, v4, s[sgpr104] // re-calculate remainder -v_sub_u32 v5, s[sgpr107], v5 // re-calculate remainder +v_mul_u32_u24 v5, v4, s98 // re-calculate remainder +v_sub_u32 v5, s100, v5 // re-calculate remainder s_mov_b64 exec, -1 // Reset exec v_readfirstlane_b32 s[sgprWorkGroup1], v4 // quotient v_readfirstlane_b32 s[sgprWorkGroup0], v5 // remainder -s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgprWorkGroup0], s[sgpr107], s[sgprWorkGroup0] // WorkGroup0=remainder -s_mul_i32 s[sgpr106], s[sgpr106], s[sgpr108] // blockId * WGM -s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s[sgpr106] // wg1 += blockId * WGM +s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s98 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup0], s100, s[sgprWorkGroup0] // WorkGroup0=remainder +s_mul_i32 s97, s97, s101 // blockId * WGM +s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s97 // wg1 += blockId * WGM s_branch label_WGM label_WGMPositive: -s_mov_b32 s[sgpr108], s[sgprWGM] // WGM -v_cvt_f32_u32 v4, s[sgpr108] // WGM +s_mov_b32 s101, s[sgprWGM] // WGM +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprWorkGroup1] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprWorkGroup1], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr106], v4 // quotient -s_mul_i32 s[sgpr107], s[sgpr106], s[sgpr108] // quotient * non-magic divisor -s_sub_u32 s[sgpr107], s[sgprWorkGroup1], s[sgpr107] // WorkGroup1=remainder -s_mul_i32 s[sgpr107], s[sgpr107], s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 -s_add_u32 s[sgpr107], s[sgpr107], s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 -v_cvt_f32_u32 v4, s[sgpr108] // WGM +v_readfirstlane_b32 s97, v4 // quotient +s_mul_i32 s100, s97, s101 // quotient * non-magic divisor +s_sub_u32 s100, s[sgprWorkGroup1], s100 // WorkGroup1=remainder +s_mul_i32 s100, s100, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s100, s100, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprNumWorkGroups1] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprNumWorkGroups1], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr104], v4 // quotient -s_mul_i32 s[sgpr105], s[sgpr108], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgpr105], s[sgprNumWorkGroups1], s[sgpr105] // NumWorkGroups1=remainder -s_cmp_eq_u32 s[sgpr105], 0 // remainder == 0 ? -s_cmov_b32 s[sgpr105], s[sgpr108] // remainder = WGM if remainder == 0 -s_cmp_ge_u32 s[sgpr106], s[sgpr104] // blockId >= numFullBlocks ? -s_cselect_b32 s[sgpr104], s[sgpr105], s[sgpr108] -v_cvt_f32_u32 v4, s[sgpr104] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_cvt_f32_u32 v5, s[sgpr107] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_mul_f32 v4, v4, v5 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_mul_u32_u24 v5, v4, s[sgpr104] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_sub_u32 v5, s[sgpr107], v5 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_cmpx_eq_u32 exec, v5, s[sgpr104] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_add_u32 v4, 1, v4 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_mov_b32 v5, 0 // s[sgprWorkGroup1] = s[sgpr107] % s[sgpr104] +v_readfirstlane_b32 s98, v4 // quotient +s_mul_i32 s99, s101, s98 // quotient * non-magic divisor +s_sub_u32 s99, s[sgprNumWorkGroups1], s99 // NumWorkGroups1=remainder +s_cmp_eq_u32 s99, 0 // remainder == 0 ? +s_cmov_b32 s99, s101 // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s97, s98 // blockId >= numFullBlocks ? +s_cselect_b32 s98, s99, s101 +v_cvt_f32_u32 v4, s98 // s[sgprWorkGroup0] = s100 / s98 +v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup0] = s100 / s98 +v_cvt_f32_u32 v5, s100 // s[sgprWorkGroup0] = s100 / s98 +v_mul_f32 v4, v4, v5 // s[sgprWorkGroup0] = s100 / s98 +v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup0] = s100 / s98 +v_mul_u32_u24 v5, v4, s98 // s[sgprWorkGroup0] = s100 / s98 +v_sub_u32 v5, s100, v5 // s[sgprWorkGroup0] = s100 / s98 +v_cmpx_eq_u32 exec, v5, s98 // s[sgprWorkGroup0] = s100 / s98 +v_add_u32 v4, 1, v4 // s[sgprWorkGroup0] = s100 / s98 +v_mov_b32 v5, 0 // s[sgprWorkGroup1] = s100 % s98 s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr104] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s98 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 -v_mul_u32_u24 v5, v4, s[sgpr104] // re-calculate remainder -v_sub_u32 v5, s[sgpr107], v5 // re-calculate remainder +v_mul_u32_u24 v5, v4, s98 // re-calculate remainder +v_sub_u32 v5, s100, v5 // re-calculate remainder s_mov_b64 exec, -1 // Reset exec v_readfirstlane_b32 s[sgprWorkGroup0], v4 // quotient v_readfirstlane_b32 s[sgprWorkGroup1], v5 // remainder -s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgprWorkGroup1], s[sgpr107], s[sgprWorkGroup1] // WorkGroup1=remainder -s_mul_i32 s[sgpr106], s[sgpr106], s[sgpr108] // blockId * WGM -s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s[sgpr106] // wg1 += blockId * WGM +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s98 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s100, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s97, s97, s101 // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s97 // wg1 += blockId * WGM label_WGM: /******************************************/ @@ -1175,8 +1150,8 @@ v_lshl_add_u32 v5, v7, 14, v5 // 7. wave offset in M dimen: /* local read addresses: final offsets a */ v_lshrrev_b32 v6, 6, v[vgprSerial] // 6 = Serial / 64 v_lshrrev_b32 v6, 2, v6 // LSU offset: Get LSU wave_id -s_mov_b32 s[sgpr104], 128 // LSU offset: stride = lsuStride(128) when umlds==True -v_mul_lo_u32 v6, s[sgpr104], v6 // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD) +s_mov_b32 s97, 128 // LSU offset: stride = lsuStride(128) when umlds==True +v_mul_lo_u32 v6, s97, v6 // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD) v_add_u32 v[vgprLocalReadAddrA], v6, v4 // Final Offset: offset = (lro0+lsuoffset)*bpeDS(1) v_lshrrev_b32 v7, 10, v[vgprLocalReadAddrA] // Final Offset: padding 32 per block 1024 v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 5, v[vgprLocalReadAddrA] // Final Offset: padding 32 per block 1024 @@ -1185,7 +1160,7 @@ v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 5, v[vgprLocalReadAddrA] // Final Offs v_lshrrev_b32 v4, 6, v[vgprSerial] // 4 = Serial / 64 v_lshrrev_b32 v4, 2, v4 // LSU offset: Get LSU wave_id // LSU offset: stride = lsuStride(128) when umlds==True (dup assign opt.) -v_mul_lo_u32 v4, s[sgpr104], v4 // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD) +v_mul_lo_u32 v4, s97, v4 // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD) v_add_u32 v[vgprLocalReadAddrB], v4, v5 // Final Offset: offset = (lro1+lsuoffset)*bpeDS(1) v_lshrrev_b32 v6, 10, v[vgprLocalReadAddrB] // Final Offset: padding 32 per block 1024 v_lshl_add_u32 v[vgprLocalReadAddrB], v6, 5, v[vgprLocalReadAddrB] // Final Offset: padding 32 per block 1024 @@ -1300,108 +1275,80 @@ s_mul_i32 s[sgprScalarGlobalReadOffsetB+5], s[sgprStrideB1J], 192 // compute off s_mul_i32 s[sgprScalarGlobalReadOffsetB+6], s[sgprStrideB1J], 224 // compute offset diff (scaled tileDim) // scalar offset *= bytes/element (multiplier is 1, do nothing) -// Use sgprScalarGlobalReadOffsetA sgprs -.set sgpr104, sgprSKItersPerWG // skitersperwg, overwrite, 54 -.set sgpr105, sgprskGrid // skgrid, overwrite, 55 -.set sgpr106, sgprMagicNumberProblemNumGroupTiles0 // sgprMagicNumberProblemNumGroupTiles0, 46 -.set sgpr107, sgprMagicShiftProblemNumGroupTiles0 // sgprMagicShiftProblemNumGroupTiles0, 47 -.set sgpr108, sgprMagicShiftItersPerTile // sgprMagicShiftItersPerTile, 50 -.set sgpr109, sgprMagicNumProblemNumGroupTiles0By1 // sgprMagicNumProblemNumGroupTiles0By1, 51 -.set sgpr110, sgprWGM // wgm, 7 - -// Save sgpr values to vgpr -v_writelane_b32 v255, s[sgprSKItersPerWG], 0 -s_nop 0 -v_writelane_b32 v255, s[sgprskGrid], 1 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicNumberProblemNumGroupTiles0], 2 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicShiftProblemNumGroupTiles0], 3 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicShiftItersPerTile], 4 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicNumProblemNumGroupTiles0By1], 5 -s_nop 0 -v_writelane_b32 v255, s[sgprWGM], 6 -s_nop 0 -v_writelane_b32 v255, s[sgprKernArgAddress], 7 -s_nop 0 -v_writelane_b32 v255, s[sgprKernArgAddress+1], 8 - /* global read addresses: addresses a */ /* max read offset = size[n] * stride[n-1] */ -s_mul_hi_u32 s[sgpr107], s[sgprWorkGroup0], 256 // WorkGroup[01] * MT -s_mul_i32 s[sgpr106], s[sgprWorkGroup0], 256 // WorkGroup[01] * MT -s_mul_hi_u32 s[sgpr107], s[sgpr106], s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr106], s[sgpr106], s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr104], s[sgprStreamKLocalStart], DepthU // StreamK tile start offset -s_mul_hi_u32 s[sgpr105], s[sgpr104], constStrideAL // StreamK tile start offset -s_mul_i32 s[sgpr104], s[sgpr104], constStrideAL // StreamK tile start offset -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum GsuOffset term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum GsuOffset term to tilestart +s_mul_hi_u32 s101, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_i32 s100, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s101, s100, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_mul_i32 s100, s100, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_mul_i32 s98, s[sgprStreamKLocalStart], DepthU // StreamK tile start offset +s_mul_hi_u32 s99, s98, constStrideAL // StreamK tile start offset +s_mul_i32 s98, s98, constStrideAL // StreamK tile start offset +s_add_u32 s100, s100, s98 // accum GsuOffset term to tilestart +s_addc_u32 s101, s101, s99 // accum GsuOffset term to tilestart s_mov_b64 s[sgprShadowLimitA+0:sgprShadowLimitA+0+1], 1 // Init tensor size -s_sub_u32 s[sgpr104], s[sgprSizeL], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], constStrideAL, s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], constStrideAL, s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgpr104], s[sgprSizeI], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], s[sgprStrideA0I], s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], s[sgprStrideA0I], s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr106] // sub tileStart -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr107] // sub tileStart +s_sub_u32 s98, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s99, constStrideAL, s98 // stride x (size-1) +s_mul_i32 s98, constStrideAL, s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // sum tensor size +s_sub_u32 s98, s[sgprSizeI], 1 // (size-1) +s_mul_hi_u32 s99, s[sgprStrideA0I], s98 // stride x (size-1) +s_mul_i32 s98, s[sgprStrideA0I], s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // sum tensor size +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s100 // sub tileStart +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s101 // sub tileStart // Set limit to use bytes (byte is 1, do nothing) s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 16 // extend limit for pre-pad s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 -s_mul_hi_u32 s[sgpr105], s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG -s_mul_i32 s[sgpr104], s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum wg term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum wg term to tilestart +s_mul_hi_u32 s99, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s98, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s100, s100, s98 // accum wg term to tilestart +s_addc_u32 s101, s101, s99 // accum wg term to tilestart // tileStart *= BPE (multiplier is 1, do nothing) -s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgpr106] // SRD base = Address+ tileStart0 -s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgpr107] // SRD base = Address+ tileStart1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s100 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s101 // SRD base = Address+ tileStart1 s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD /* global read addresses: addresses b */ /* max read offset = size[n] * stride[n-1] */ -s_mul_hi_u32 s[sgpr107], s[sgprWorkGroup1], 256 // WorkGroup[01] * MT -s_mul_i32 s[sgpr106], s[sgprWorkGroup1], 256 // WorkGroup[01] * MT -s_mul_hi_u32 s[sgpr107], s[sgpr106], s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr106], s[sgpr106], s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr104], s[sgprStreamKLocalStart], DepthU // StreamK tile start offset -s_mul_hi_u32 s[sgpr105], s[sgpr104], constStrideBL // StreamK tile start offset -s_mul_i32 s[sgpr104], s[sgpr104], constStrideBL // StreamK tile start offset -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum GsuOffset term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum GsuOffset term to tilestart +s_mul_hi_u32 s101, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_i32 s100, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s101, s100, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_mul_i32 s100, s100, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_mul_i32 s98, s[sgprStreamKLocalStart], DepthU // StreamK tile start offset +s_mul_hi_u32 s99, s98, constStrideBL // StreamK tile start offset +s_mul_i32 s98, s98, constStrideBL // StreamK tile start offset +s_add_u32 s100, s100, s98 // accum GsuOffset term to tilestart +s_addc_u32 s101, s101, s99 // accum GsuOffset term to tilestart s_mov_b64 s[sgprShadowLimitB+0:sgprShadowLimitB+0+1], 1 // Init tensor size -s_sub_u32 s[sgpr104], s[sgprSizeL], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], constStrideBL, s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], constStrideBL, s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgpr104], s[sgprSizeJ], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], s[sgprStrideB1J], s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], s[sgprStrideB1J], s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr106] // sub tileStart -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr107] // sub tileStart +s_sub_u32 s98, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s99, constStrideBL, s98 // stride x (size-1) +s_mul_i32 s98, constStrideBL, s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // sum tensor size +s_sub_u32 s98, s[sgprSizeJ], 1 // (size-1) +s_mul_hi_u32 s99, s[sgprStrideB1J], s98 // stride x (size-1) +s_mul_i32 s98, s[sgprStrideB1J], s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // sum tensor size +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s100 // sub tileStart +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s101 // sub tileStart // Set limit to use bytes (byte is 1, do nothing) s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], 16 // extend limit for pre-pad s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], 0 // extend limit for pre-pad s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 -s_mul_hi_u32 s[sgpr105], s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG -s_mul_i32 s[sgpr104], s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum wg term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum wg term to tilestart +s_mul_hi_u32 s99, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s98, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s100, s100, s98 // accum wg term to tilestart +s_addc_u32 s101, s101, s99 // accum wg term to tilestart // tileStart *= BPE (multiplier is 1, do nothing) -s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgpr106] // SRD base = Address+ tileStart0 -s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgpr107] // SRD base = Address+ tileStart1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s100 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s101 // SRD base = Address+ tileStart1 s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD /* global read addresses: increments a */ @@ -1415,87 +1362,87 @@ v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0 // s[Alpha] == 0.0f ? s_cbranch_vccz label_SKAlphaCheck2 // branch if s[Alpha] != 0 s_mov_b32 s[sgprLoopCounterL], 0 // Skip iterations label_SKAlphaCheck2: -s_and_b32 s[sgpr105], 127, s[sgprSizesSum+0] // s[sgpr105] = s[sgprSizesSum+0] % 128 -s_cmp_eq_u32 s[sgpr105], 0 // numIterL == 0 -s_cselect_b32 s[sgpr104], 0, 1 // check if size uses tail loop +s_and_b32 s99, 127, s[sgprSizesSum+0] // s99 = s[sgprSizesSum+0] % 128 +s_cmp_eq_u32 s99, 0 // numIterL == 0 +s_cselect_b32 s98, 0, 1 // check if size uses tail loop s_cmp_eq_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Check if WG processes final iteration of tile -s_cselect_b32 s[sgpr104], s[sgpr104], 0 // this WG runs tail loop -s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], s[sgpr104] // Adjust loop counter for tail loop +s_cselect_b32 s98, s98, 0 // this WG runs tail loop +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], s98 // Adjust loop counter for tail loop s_mov_b32 s[sgprOrigLoopCounter], s[sgprLoopCounterL] // copy loop counter -s_and_b32 s[sgpr106], s[sgprStaggerU], 0x1f00 -s_lshr_b32 s[sgpr106], s[sgpr106], 0x8 -s_and_b32 s[sgpr107], s[sgprStaggerU], 0xe000 +s_and_b32 s100, s[sgprStaggerU], 0x1f00 +s_lshr_b32 s100, s100, 0x8 +s_and_b32 s101, s[sgprStaggerU], 0xe000 s_and_b32 s[sgprStaggerU], s[sgprStaggerU], 0xff -s_mov_b32 s[sgpr104], s[sgprStaggerU] // init staggerU +s_mov_b32 s98, s[sgprStaggerU] // init staggerU label_beginStaggerUIter: -s_lshl_b32 s[sgpr105], s[sgpr104], s[sgpr106] // shift by StaggerUStride -s_cmp_ge_u32 s[sgprOrigLoopCounter], s[sgpr105] // loopCount >= current shift Count +s_lshl_b32 s99, s98, s100 // shift by StaggerUStride +s_cmp_ge_u32 s[sgprOrigLoopCounter], s99 // loopCount >= current shift Count s_cbranch_scc1 label_endStaggerUIter // jump to end -s_lshr_b32 s[sgpr104], s[sgpr104], 1 // step down to smaller stagger +s_lshr_b32 s98, s98, 1 // step down to smaller stagger s_branch label_beginStaggerUIter // jump to begin label_endStaggerUIter: -s_sub_u32 s[sgpr105], s[sgpr104], 1 // staggerU mask -s_cmp_ge_u32 s[sgpr104], 1 // if current staggerU >= 1 -s_cselect_b32 s[sgprStaggerUIter], s[sgpr105], 0 // set Mask -s_cmp_eq_u32 s[sgpr107], 0x0 +s_sub_u32 s99, s98, 1 // staggerU mask +s_cmp_ge_u32 s98, 1 // if current staggerU >= 1 +s_cselect_b32 s[sgprStaggerUIter], s99, 0 // set Mask +s_cmp_eq_u32 s101, 0x0 s_cbranch_scc1 label_StaggerUMapping_1 -s_mov_b32 s[sgpr104], s[sgprWorkGroup0] +s_mov_b32 s98, s[sgprWorkGroup0] s_branch label_staggerInputEnd label_StaggerUMapping_1: -s_cmp_eq_u32 s[sgpr107], 0x2000 +s_cmp_eq_u32 s101, 0x2000 s_cbranch_scc1 label_StaggerUMapping_2 -s_mov_b32 s[sgpr104], s[sgprWorkGroup1] +s_mov_b32 s98, s[sgprWorkGroup1] s_branch label_staggerInputEnd label_StaggerUMapping_2: -s_cmp_eq_u32 s[sgpr107], 0x4000 +s_cmp_eq_u32 s101, 0x4000 s_cbranch_scc1 label_StaggerUMapping_3 -s_mov_b32 s[sgpr104], -0x1 +s_mov_b32 s98, -0x1 s_branch label_staggerInputEnd label_StaggerUMapping_3: -s_cmp_eq_u32 s[sgpr107], 0x6000 +s_cmp_eq_u32 s101, 0x6000 s_cbranch_scc1 label_StaggerUMapping_4 -s_mul_i32 s[sgpr105], s[sgprNumWorkGroups0], s[sgprWorkGroup1] -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr105] -s_add_u32 s[sgpr104], s[sgpr104], s[sgprWorkGroup0] +s_mul_i32 s99, s[sgprNumWorkGroups0], s[sgprWorkGroup1] +s_add_u32 s98, s98, s99 +s_add_u32 s98, s98, s[sgprWorkGroup0] s_branch label_staggerInputEnd label_StaggerUMapping_4: -s_cmp_eq_u32 s[sgpr107], 0x8000 +s_cmp_eq_u32 s101, 0x8000 s_cbranch_scc1 label_staggerInputEnd -s_mov_b32 s[sgpr104], -0x1 +s_mov_b32 s98, -0x1 s_branch label_staggerInputEnd label_staggerInputEnd: -s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s[sgpr104] // Compute actual stagger start for this tile -s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s[sgpr106] // shift by StaggerUStride +s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s98 // Compute actual stagger start for this tile +s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s100 // shift by StaggerUStride s_cmp_gt_u32 s[sgprStreamKLocalStart], 0 // does wg start tile? s_cmov_b32 s[sgprStaggerUIter], 0 // set stagger=0 for partial tiles s_cmp_lt_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // does wg finish tile? s_cmov_b32 s[sgprStaggerUIter], 0 // set stagger=0 for partial tiles /* SRDs += (StaggerUIter) * GlobalReadIncsA+0 */ -s_mul_hi_i32 s[sgpr105], s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset -s_mul_i32 s[sgpr104], s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset +s_mul_hi_i32 s99, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset +s_mul_i32 s98, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset s_mul_hi_i32 s[sgprWrapUA+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop s_mul_i32 s[sgprWrapUA+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop s_sub_u32 s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0], s[sgprWrapUA+0] // remove one iteration s_subb_u32 s[sgprWrapUA+1], 0, s[sgprWrapUA+1] // remove one iteration -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 /* SRDs += (StaggerUIter) * GlobalReadIncsB+0 */ -s_mul_hi_i32 s[sgpr105], s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset -s_mul_i32 s[sgpr104], s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset +s_mul_hi_i32 s99, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset +s_mul_i32 s98, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset s_mul_hi_i32 s[sgprWrapUB+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop s_mul_i32 s[sgprWrapUB+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop s_sub_u32 s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0], s[sgprWrapUB+0] // remove one iteration s_subb_u32 s[sgprWrapUB+1], 0, s[sgprWrapUB+1] // remove one iteration -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 s_add_u32 s[sgprStaggerUIter], s[sgprStaggerUIter], 2 // Subtract (PGR-1); StaggerUIter now contains target iteration to wrap @@ -1545,26 +1492,26 @@ s_add_u32 m0, m0, 4224 // Move LDS write address to buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0 /* global read inc A loopL */ -s_add_u32 s[sgpr106], s[sgprLoopCounterL], 1 // remove pf(1) -s_cmp_eq_u32 s[sgprStaggerUIter], s[sgpr106] // Is this wrapIter? (pf) -s_cselect_b32 s[sgpr104], s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUA+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_add_u32 s100, s[sgprLoopCounterL], 1 // remove pf(1) +s_cmp_eq_u32 s[sgprStaggerUIter], s100 // Is this wrapIter? (pf) +s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUA+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 /* global read inc B loopL */ -s_add_u32 s[sgpr106], s[sgprLoopCounterL], 1 // remove pf(1) -s_cmp_eq_u32 s[sgprStaggerUIter], s[sgpr106] // Is this wrapIter? (pf) -s_cselect_b32 s[sgpr104], s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUB+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_add_u32 s100, s[sgprLoopCounterL], 1 // remove pf(1) +s_cmp_eq_u32 s[sgprStaggerUIter], s100 // Is this wrapIter? (pf) +s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUB+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 @@ -1581,28 +1528,28 @@ s_mov_b32 s[sgprSrdC+2], BufferOOB s_mov_b32 s[sgprSrdC+3], Srd127_96 // Set bits 127_96 in post-loop SRD -s_mul_i32 s[sgpr106], MT1, s[sgprWorkGroup1] // <- wg1*MT1 -s_mul_hi_u32 s[sgpr105], s[sgpr106], s[sgprStrideC1J] // ScaleC s[sgpr106] by Stride -s_mul_i32 s[sgpr104], s[sgpr106], s[sgprStrideC1J] // ScaleC s[sgpr106] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s[sgpr105] // add hi to SRD -s_mul_hi_u32 s[sgpr105], s[sgpr106], s[sgprStrideD1J] // ScaleD s[sgpr106] by Stride -s_mul_i32 s[sgpr104], s[sgpr106], s[sgprStrideD1J] // ScaleD s[sgpr106] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s[sgpr105] // add hi to SRD - -s_mul_hi_u32 s[sgpr105], s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride -s_mul_i32 s[sgpr104], s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s[sgpr105] // add hi to SRD -s_mul_hi_u32 s[sgpr105], s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride -s_mul_i32 s[sgpr104], s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgpr105] // add hi to SRD +s_mul_i32 s100, MT1, s[sgprWorkGroup1] // <- wg1*MT1 +s_mul_hi_u32 s99, s100, s[sgprStrideC1J] // ScaleC s100 by Stride +s_mul_i32 s98, s100, s[sgprStrideC1J] // ScaleC s100 by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s99 // add hi to SRD +s_mul_hi_u32 s99, s100, s[sgprStrideD1J] // ScaleD s100 by Stride +s_mul_i32 s98, s100, s[sgprStrideD1J] // ScaleD s100 by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s99 // add hi to SRD + +s_mul_hi_u32 s99, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride +s_mul_i32 s98, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s99 // add hi to SRD +s_mul_hi_u32 s99, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride +s_mul_i32 s98, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s99 // add hi to SRD /* initC: remove ValuC vgpr buffer [0...0) from pool */ @@ -1870,11 +1817,11 @@ s_cmp_eq_u32 s[sgprLoopCounterL], 0 // at last iteration? /* after InitC, skip to end of prefetch last iter if numIter==0 */ s_cbranch_scc0 label_NoBranch_8S4L1KCK9VFC7AQU // Only branch on scc1 -s_getpc_b64 s[sgpr104:sgpr105] // addr of next instr -s_add_i32 s[sgpr106], label_PrefetchGlobalLastIterEnd, 4 // target branch offset -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr106] // add target branch offset -s_addc_u32 s[sgpr105], s[sgpr105], 0 // add high and carry -s_setpc_b64 s[sgpr104:sgpr105] // branch to label_PrefetchGlobalLastIterEnd +s_getpc_b64 s[98:99] // addr of next instr +s_add_i32 s100, label_PrefetchGlobalLastIterEnd, 4 // target branch offset +s_add_u32 s98, s98, s100 // add target branch offset +s_addc_u32 s99, s99, 0 // add high and carry +s_setpc_b64 s[98:99] // branch to label_PrefetchGlobalLastIterEnd label_NoBranch_8S4L1KCK9VFC7AQU: s_waitcnt vmcnt(0) // wait for global read s_barrier // For stream-k / persistent loop @@ -1924,6 +1871,7 @@ s_add_u32 m0, m0, 4224 // Move LDS write address to buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> Reg 0_0_6_0 s_add_u32 m0, m0, 4224 // Move LDS write address to next line buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0 + /* local write swap a */ s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR @@ -1954,7 +1902,7 @@ ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] o ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0 s_waitcnt lgkmcnt(0) - + /******************************************/ /* Unrolled Loop(s) - Begin */ /******************************************/ @@ -1995,15 +1943,15 @@ ds_read_b128 v[vgprValuA_X0_I0+60:vgprValuA_X0_I0+60+3], v[vgprLocalReadAddrA] o v_mfma_f32_16x16x128_f8f6f4 acc[32:35], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[32:35] cbsz:0 blgp:0 // left value = acc[32+0:35+0] /* global read inc A loopL */ s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s[sgpr104], s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUA+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) +s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUA+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) /* mfmaIndex:9 */ v_mfma_f32_16x16x128_f8f6f4 acc[36:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[36:39] cbsz:0 blgp:0 // left value = acc[36+0:39+0] -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address @@ -2018,10 +1966,10 @@ s_barrier v_mfma_f32_16x16x128_f8f6f4 acc[44:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[44:47] cbsz:0 blgp:0 // left value = acc[44+0:47+0] /* global read inc B loopL */ s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s[sgpr104], s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUB+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) +s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUB+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) /* mfmaIndex:16 */ v_mfma_f32_16x16x128_f8f6f4 acc[64:67], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[64:67] cbsz:0 blgp:0 // left value = acc[64+0:67+0] @@ -2078,8 +2026,8 @@ v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB /* mfmaIndex:6 */ v_mfma_f32_16x16x128_f8f6f4 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[24:27] cbsz:0 blgp:0 // left value = acc[24+0:27+0] -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 @@ -2332,15 +2280,15 @@ v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB v_mfma_f32_16x16x128_f8f6f4 acc[72:75], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[72:75] cbsz:0 blgp:0 // left value = acc[72+0:75+0] /* global read inc A loopL */ s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s[sgpr104], s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUA+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) +s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUA+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) /* mfmaIndex:19 */ v_mfma_f32_16x16x128_f8f6f4 acc[76:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[76:79] cbsz:0 blgp:0 // left value = acc[76+0:79+0] -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 @@ -2348,15 +2296,15 @@ s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow t v_mfma_f32_16x16x128_f8f6f4 acc[96:99], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[96:99] cbsz:0 blgp:0 // left value = acc[96+0:99+0] /* global read inc B loopL */ s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s[sgpr104], s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUB+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) +s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUB+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) /* mfmaIndex:25 */ v_mfma_f32_16x16x128_f8f6f4 acc[100:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[100:103] cbsz:0 blgp:0 // left value = acc[100+0:103+0] -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 @@ -2762,12 +2710,12 @@ label_PrefetchGlobalLastIterEnd: /******************************************/ /* local write reset offsets a */ -s_xor_b32 s[sgpr104], s[sgprSwapA], s[sgprLocalWriteAddrA] // Get other lds buffer offset value -s_min_u32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgpr104] // Set LWA to first buffer offset +s_xor_b32 s97, s[sgprSwapA], s[sgprLocalWriteAddrA] // Get other lds buffer offset value +s_min_u32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s97 // Set LWA to first buffer offset /* local write reset offsets b */ -s_xor_b32 s[sgpr104], s[sgprSwapB], s[sgprLocalWriteAddrB] // Get other lds buffer offset value -s_min_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgpr104] // Set LWA to first buffer offset +s_xor_b32 s97, s[sgprSwapB], s[sgprLocalWriteAddrB] // Get other lds buffer offset value +s_min_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s97 // Set LWA to first buffer offset /* Check out VGPR (numG2LA,numG2LB,numG2LMetadata) = (32,32,0) */ .set vgprG2LA_BASE, 4 .set vgprG2LA, vgprG2LA_BASE+0 @@ -2786,57 +2734,56 @@ s_mov_b32 s[sgprOrigLoopCounter], 0 // repurpose to count each lo s_cbranch_scc1 label_SkipTailLoopL // skip to end of tail loop b/c numIter==0 /* remove stagger offsets for tail loop */ -s_sub_i32 s[sgpr104], 3, s[sgprStaggerUIter] -s_cmp_ge_i32 s[sgpr104], 0 +s_sub_i32 s98, 3, s[sgprStaggerUIter] +s_cmp_ge_i32 s98, 0 s_cbranch_scc0 label_Negative_J5DQFVGFWLXU2DUR -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes s_branch label_MultiplyDone_DLSAQLEVYLOBCPNL label_Negative_J5DQFVGFWLXU2DUR: -s_abs_i32 s[sgpr104], s[sgpr104] -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_xor_b32 s[sgpr104], s[sgpr104], 0xffffffff -s_xor_b32 s[sgpr105], s[sgpr105], 0xffffffff -s_add_u32 s[sgpr104], s[sgpr104], 0x1 -s_addc_u32 s[sgpr105], s[sgpr105], 0 +s_abs_i32 s98, s98 +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_xor_b32 s98, s98, 0xffffffff +s_xor_b32 s99, s99, 0xffffffff +s_add_u32 s98, s98, 0x1 +s_addc_u32 s99, s99, 0 label_MultiplyDone_DLSAQLEVYLOBCPNL: -s_sub_u32 s[sgpr104], s[sgpr104], s[sgprWrapUA] // S - WrapU -s_subb_u32 s[sgpr105], s[sgpr105], s[sgprWrapUA+1] // S - WrapU -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_sub_u32 s98, s98, s[sgprWrapUA] // S - WrapU +s_subb_u32 s99, s99, s[sgprWrapUA+1] // S - WrapU +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 -s_sub_i32 s[sgpr104], 3, s[sgprStaggerUIter] -s_cmp_ge_i32 s[sgpr104], 0 +s_sub_i32 s98, 3, s[sgprStaggerUIter] +s_cmp_ge_i32 s98, 0 s_cbranch_scc0 label_Negative_LQI6BOBE0EY8XIP1 -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes s_branch label_MultiplyDone_9N1QELR2XL4Z0HRB label_Negative_LQI6BOBE0EY8XIP1: -s_abs_i32 s[sgpr104], s[sgpr104] -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_xor_b32 s[sgpr104], s[sgpr104], 0xffffffff -s_xor_b32 s[sgpr105], s[sgpr105], 0xffffffff -s_add_u32 s[sgpr104], s[sgpr104], 0x1 -s_addc_u32 s[sgpr105], s[sgpr105], 0 +s_abs_i32 s98, s98 +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_xor_b32 s98, s98, 0xffffffff +s_xor_b32 s99, s99, 0xffffffff +s_add_u32 s98, s98, 0x1 +s_addc_u32 s99, s99, 0 label_MultiplyDone_9N1QELR2XL4Z0HRB: -s_sub_u32 s[sgpr104], s[sgpr104], s[sgprWrapUB] // S - WrapU -s_subb_u32 s[sgpr105], s[sgpr105], s[sgprWrapUB+1] // S - WrapU -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_sub_u32 s98, s98, s[sgprWrapUB] // S - WrapU +s_subb_u32 s99, s99, s[sgprWrapUB+1] // S - WrapU +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 - // Check if K multiple of 4 -s_and_b32 s[sgpr104], s[sgprSizesSum], 3 -s_cmp_eq_u32 s[sgpr104], 0 +s_and_b32 s98, s[sgprSizesSum], 3 +s_cmp_eq_u32 s98, 0 s_cbranch_scc0 label_tailloop_non_dtl label_tailloop_dtl: @@ -3376,550 +3323,550 @@ ds_read_b128 v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprLocalReadAddrB] o ds_read_b128 v[vgprValuB_X0_I0+60:vgprValuB_X0_I0+60+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=1 oIdx=0 buffer=0 iui=0 /* local read inc a */ -s_mov_b32 s[sgpr104], 0x80 // inc -v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s[sgpr104], v[vgprLocalReadAddrA+0] // lrA += 128 (bpeDS) +s_mov_b32 s97, 0x80 // inc +v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s97, v[vgprLocalReadAddrA+0] // lrA += 128 (bpeDS) /* local read inc b */ // inc (dup assign opt.) -v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s[sgpr104], v[vgprLocalReadAddrB+0] // lrB += 128 (bpeDS) +v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s97, v[vgprLocalReadAddrB+0] // lrB += 128 (bpeDS) s_waitcnt lgkmcnt(0) // 4wait for local read v_and_b32 v135, 63, v[vgprSerial] // v135 = v[vgprSerial] % 64 v_lshrrev_b32 v135, 4, v135 // 135 = 135 / 16 v_lshlrev_b32 v135, 4, v135 // v135 = v135 * 16 v_add_u32 v136, v135, 0 -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+0], v[vgprValuA_X0_I0+32+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+0], v[vgprValuA_X0_I0+40+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+0], v[vgprValuA_X0_I0+48+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+0], v[vgprValuA_X0_I0+56+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+32+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+40+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+48+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+56+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+0], v[vgprValuA_X0_I0+32+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+0], v[vgprValuA_X0_I0+40+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+0], v[vgprValuA_X0_I0+48+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+0], v[vgprValuA_X0_I0+56+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+32+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+40+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+48+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+56+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+2], v[vgprValuA_X0_I0+32+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+2], v[vgprValuA_X0_I0+40+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+2], v[vgprValuA_X0_I0+48+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+2], v[vgprValuA_X0_I0+56+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+3], v[vgprValuA_X0_I0+32+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+3], v[vgprValuA_X0_I0+40+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+3], v[vgprValuA_X0_I0+48+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+3], v[vgprValuA_X0_I0+56+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+2], v[vgprValuA_X0_I0+32+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+2], v[vgprValuA_X0_I0+40+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+2], v[vgprValuA_X0_I0+48+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+2], v[vgprValuA_X0_I0+56+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+3], v[vgprValuA_X0_I0+32+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+3], v[vgprValuA_X0_I0+40+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+3], v[vgprValuA_X0_I0+48+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+3], v[vgprValuA_X0_I0+56+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+4], v[vgprValuA_X0_I0+0+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+4], v[vgprValuA_X0_I0+8+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+4], v[vgprValuA_X0_I0+16+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+4], v[vgprValuA_X0_I0+24+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+4], v[vgprValuA_X0_I0+32+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+4], v[vgprValuA_X0_I0+40+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+4], v[vgprValuA_X0_I0+48+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+4], v[vgprValuA_X0_I0+56+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+5], v[vgprValuA_X0_I0+0+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+5], v[vgprValuA_X0_I0+8+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+5], v[vgprValuA_X0_I0+16+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+5], v[vgprValuA_X0_I0+24+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+5], v[vgprValuA_X0_I0+32+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+5], v[vgprValuA_X0_I0+40+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+5], v[vgprValuA_X0_I0+48+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+5], v[vgprValuA_X0_I0+56+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+4], v[vgprValuA_X0_I0+0+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+4], v[vgprValuA_X0_I0+8+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+4], v[vgprValuA_X0_I0+16+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+4], v[vgprValuA_X0_I0+24+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+4], v[vgprValuA_X0_I0+32+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+4], v[vgprValuA_X0_I0+40+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+4], v[vgprValuA_X0_I0+48+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+4], v[vgprValuA_X0_I0+56+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+5], v[vgprValuA_X0_I0+0+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+5], v[vgprValuA_X0_I0+8+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+5], v[vgprValuA_X0_I0+16+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+5], v[vgprValuA_X0_I0+24+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+5], v[vgprValuA_X0_I0+32+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+5], v[vgprValuA_X0_I0+40+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+5], v[vgprValuA_X0_I0+48+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+5], v[vgprValuA_X0_I0+56+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+6], v[vgprValuA_X0_I0+0+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+6], v[vgprValuA_X0_I0+8+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+6], v[vgprValuA_X0_I0+16+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+6], v[vgprValuA_X0_I0+24+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+6], v[vgprValuA_X0_I0+32+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+6], v[vgprValuA_X0_I0+40+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+6], v[vgprValuA_X0_I0+48+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+6], v[vgprValuA_X0_I0+56+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+6], v[vgprValuA_X0_I0+0+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+6], v[vgprValuA_X0_I0+8+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+6], v[vgprValuA_X0_I0+16+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+6], v[vgprValuA_X0_I0+24+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+6], v[vgprValuA_X0_I0+32+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+6], v[vgprValuA_X0_I0+40+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+6], v[vgprValuA_X0_I0+48+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+6], v[vgprValuA_X0_I0+56+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL v_and_b32 v135, 63, v[vgprSerial] // v135 = v[vgprSerial] % 64 v_lshrrev_b32 v135, 4, v135 // 135 = 135 / 16 v_lshlrev_b32 v135, 4, v135 // v135 = v135 * 16 v_add_u32 v136, v135, 0 -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+0], v[vgprValuB_X0_I0+32+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+0], v[vgprValuB_X0_I0+40+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+0], v[vgprValuB_X0_I0+48+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+0], v[vgprValuB_X0_I0+56+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+1], v[vgprValuB_X0_I0+32+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+1], v[vgprValuB_X0_I0+40+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+1], v[vgprValuB_X0_I0+48+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+1], v[vgprValuB_X0_I0+56+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+0], v[vgprValuB_X0_I0+32+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+0], v[vgprValuB_X0_I0+40+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+0], v[vgprValuB_X0_I0+48+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+0], v[vgprValuB_X0_I0+56+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+1], v[vgprValuB_X0_I0+32+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+1], v[vgprValuB_X0_I0+40+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+1], v[vgprValuB_X0_I0+48+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+1], v[vgprValuB_X0_I0+56+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+2], v[vgprValuB_X0_I0+32+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+2], v[vgprValuB_X0_I0+40+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+2], v[vgprValuB_X0_I0+48+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+2], v[vgprValuB_X0_I0+56+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+3], v[vgprValuB_X0_I0+32+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+3], v[vgprValuB_X0_I0+40+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+3], v[vgprValuB_X0_I0+48+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+3], v[vgprValuB_X0_I0+56+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+2], v[vgprValuB_X0_I0+32+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+2], v[vgprValuB_X0_I0+40+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+2], v[vgprValuB_X0_I0+48+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+2], v[vgprValuB_X0_I0+56+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+3], v[vgprValuB_X0_I0+32+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+3], v[vgprValuB_X0_I0+40+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+3], v[vgprValuB_X0_I0+48+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+3], v[vgprValuB_X0_I0+56+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+4], v[vgprValuB_X0_I0+0+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+4], v[vgprValuB_X0_I0+8+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+4], v[vgprValuB_X0_I0+16+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+4], v[vgprValuB_X0_I0+24+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+4], v[vgprValuB_X0_I0+32+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+4], v[vgprValuB_X0_I0+40+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+4], v[vgprValuB_X0_I0+48+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+4], v[vgprValuB_X0_I0+56+0+0+4], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+5], v[vgprValuB_X0_I0+0+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+5], v[vgprValuB_X0_I0+8+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+5], v[vgprValuB_X0_I0+16+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+5], v[vgprValuB_X0_I0+24+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+5], v[vgprValuB_X0_I0+32+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+5], v[vgprValuB_X0_I0+40+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+5], v[vgprValuB_X0_I0+48+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+5], v[vgprValuB_X0_I0+56+0+0+5], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+4], v[vgprValuB_X0_I0+0+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+4], v[vgprValuB_X0_I0+8+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+4], v[vgprValuB_X0_I0+16+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+4], v[vgprValuB_X0_I0+24+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+4], v[vgprValuB_X0_I0+32+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+4], v[vgprValuB_X0_I0+40+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+4], v[vgprValuB_X0_I0+48+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+4], v[vgprValuB_X0_I0+56+0+0+4], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+5], v[vgprValuB_X0_I0+0+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+5], v[vgprValuB_X0_I0+8+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+5], v[vgprValuB_X0_I0+16+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+5], v[vgprValuB_X0_I0+24+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+5], v[vgprValuB_X0_I0+32+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+5], v[vgprValuB_X0_I0+40+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+5], v[vgprValuB_X0_I0+48+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+5], v[vgprValuB_X0_I0+56+0+0+5], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+6], v[vgprValuB_X0_I0+0+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+6], v[vgprValuB_X0_I0+8+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+6], v[vgprValuB_X0_I0+16+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+6], v[vgprValuB_X0_I0+24+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+6], v[vgprValuB_X0_I0+32+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+6], v[vgprValuB_X0_I0+40+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+6], v[vgprValuB_X0_I0+48+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+6], v[vgprValuB_X0_I0+56+0+0+6], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+7], v[vgprValuB_X0_I0+0+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+7], v[vgprValuB_X0_I0+8+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+7], v[vgprValuB_X0_I0+16+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+7], v[vgprValuB_X0_I0+24+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+7], v[vgprValuB_X0_I0+32+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+7], v[vgprValuB_X0_I0+40+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+7], v[vgprValuB_X0_I0+48+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+7], v[vgprValuB_X0_I0+56+0+0+7], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -s_and_b32 s[sgpr106], s[sgprLoopCounterL], 31 // get inputs for edge thread -s_sub_u32 s[sgpr106], 32, s[sgpr106] // use shift to fill 0 for outside element -s_lshl_b32 s[sgpr106], s[sgpr106], 3 // use shift to fill 0 for outside element -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+0+0+0+0:vgprValuA_X0_I0+0+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+0+0+0+2:vgprValuA_X0_I0+0+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+0+0+0+4:vgprValuA_X0_I0+0+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+0+0+0+6:vgprValuA_X0_I0+0+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+6], v[vgprValuB_X0_I0+0+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+6], v[vgprValuB_X0_I0+8+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+6], v[vgprValuB_X0_I0+16+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+6], v[vgprValuB_X0_I0+24+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+6], v[vgprValuB_X0_I0+32+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+6], v[vgprValuB_X0_I0+40+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+6], v[vgprValuB_X0_I0+48+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+6], v[vgprValuB_X0_I0+56+0+0+6], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+7], v[vgprValuB_X0_I0+0+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+7], v[vgprValuB_X0_I0+8+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+7], v[vgprValuB_X0_I0+16+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+7], v[vgprValuB_X0_I0+24+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+7], v[vgprValuB_X0_I0+32+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+7], v[vgprValuB_X0_I0+40+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+7], v[vgprValuB_X0_I0+48+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+7], v[vgprValuB_X0_I0+56+0+0+7], 0, s[98:99] // set 0 if K_idx >= sizeL +s_and_b32 s97, s[sgprLoopCounterL], 31 // get inputs for edge thread +s_sub_u32 s97, 32, s97 // use shift to fill 0 for outside element +s_lshl_b32 s97, s97, 3 // use shift to fill 0 for outside element +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+0+0+0+0:vgprValuA_X0_I0+0+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+0+0+0+2:vgprValuA_X0_I0+0+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+0+0+0+4:vgprValuA_X0_I0+0+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+0+0+0+6:vgprValuA_X0_I0+0+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+4], v[vgprValuA_X0_I0+0+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+5], v[vgprValuA_X0_I0+0+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+4], v[vgprValuA_X0_I0+0+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+5], v[vgprValuA_X0_I0+0+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+6], v[vgprValuA_X0_I0+0+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+8+0+0+0:vgprValuA_X0_I0+8+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+8+0+0+2:vgprValuA_X0_I0+8+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+8+0+0+4:vgprValuA_X0_I0+8+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+8+0+0+6:vgprValuA_X0_I0+8+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+6], v[vgprValuA_X0_I0+0+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+8+0+0+0:vgprValuA_X0_I0+8+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+8+0+0+2:vgprValuA_X0_I0+8+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+8+0+0+4:vgprValuA_X0_I0+8+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+8+0+0+6:vgprValuA_X0_I0+8+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+4], v[vgprValuA_X0_I0+8+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+5], v[vgprValuA_X0_I0+8+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+4], v[vgprValuA_X0_I0+8+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+5], v[vgprValuA_X0_I0+8+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+6], v[vgprValuA_X0_I0+8+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+16+0+0+0:vgprValuA_X0_I0+16+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+16+0+0+2:vgprValuA_X0_I0+16+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+16+0+0+4:vgprValuA_X0_I0+16+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+16+0+0+6:vgprValuA_X0_I0+16+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+6], v[vgprValuA_X0_I0+8+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+16+0+0+0:vgprValuA_X0_I0+16+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+16+0+0+2:vgprValuA_X0_I0+16+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+16+0+0+4:vgprValuA_X0_I0+16+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+16+0+0+6:vgprValuA_X0_I0+16+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+4], v[vgprValuA_X0_I0+16+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+5], v[vgprValuA_X0_I0+16+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+4], v[vgprValuA_X0_I0+16+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+5], v[vgprValuA_X0_I0+16+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+6], v[vgprValuA_X0_I0+16+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+24+0+0+0:vgprValuA_X0_I0+24+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+24+0+0+2:vgprValuA_X0_I0+24+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+24+0+0+4:vgprValuA_X0_I0+24+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+24+0+0+6:vgprValuA_X0_I0+24+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+6], v[vgprValuA_X0_I0+16+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+24+0+0+0:vgprValuA_X0_I0+24+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+24+0+0+2:vgprValuA_X0_I0+24+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+24+0+0+4:vgprValuA_X0_I0+24+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+24+0+0+6:vgprValuA_X0_I0+24+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+4], v[vgprValuA_X0_I0+24+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+5], v[vgprValuA_X0_I0+24+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+4], v[vgprValuA_X0_I0+24+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+5], v[vgprValuA_X0_I0+24+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+6], v[vgprValuA_X0_I0+24+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+32+0+0+0:vgprValuA_X0_I0+32+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+32+0+0+2:vgprValuA_X0_I0+32+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+32+0+0+4:vgprValuA_X0_I0+32+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+32+0+0+6:vgprValuA_X0_I0+32+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+6], v[vgprValuA_X0_I0+24+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+32+0+0+0:vgprValuA_X0_I0+32+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+32+0+0+2:vgprValuA_X0_I0+32+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+32+0+0+4:vgprValuA_X0_I0+32+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+32+0+0+6:vgprValuA_X0_I0+32+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+0], v[vgprValuA_X0_I0+32+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+32+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+0], v[vgprValuA_X0_I0+32+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+32+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+2], v[vgprValuA_X0_I0+32+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+3], v[vgprValuA_X0_I0+32+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+2], v[vgprValuA_X0_I0+32+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+3], v[vgprValuA_X0_I0+32+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+4], v[vgprValuA_X0_I0+32+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+5], v[vgprValuA_X0_I0+32+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+4], v[vgprValuA_X0_I0+32+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+5], v[vgprValuA_X0_I0+32+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+6], v[vgprValuA_X0_I0+32+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+40+0+0+0:vgprValuA_X0_I0+40+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+40+0+0+2:vgprValuA_X0_I0+40+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+40+0+0+4:vgprValuA_X0_I0+40+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+40+0+0+6:vgprValuA_X0_I0+40+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+6], v[vgprValuA_X0_I0+32+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+40+0+0+0:vgprValuA_X0_I0+40+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+40+0+0+2:vgprValuA_X0_I0+40+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+40+0+0+4:vgprValuA_X0_I0+40+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+40+0+0+6:vgprValuA_X0_I0+40+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+0], v[vgprValuA_X0_I0+40+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+40+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+0], v[vgprValuA_X0_I0+40+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+40+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+2], v[vgprValuA_X0_I0+40+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+3], v[vgprValuA_X0_I0+40+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+2], v[vgprValuA_X0_I0+40+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+3], v[vgprValuA_X0_I0+40+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+4], v[vgprValuA_X0_I0+40+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+5], v[vgprValuA_X0_I0+40+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+4], v[vgprValuA_X0_I0+40+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+5], v[vgprValuA_X0_I0+40+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+6], v[vgprValuA_X0_I0+40+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+48+0+0+0:vgprValuA_X0_I0+48+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+48+0+0+2:vgprValuA_X0_I0+48+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+48+0+0+4:vgprValuA_X0_I0+48+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+48+0+0+6:vgprValuA_X0_I0+48+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+6], v[vgprValuA_X0_I0+40+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+48+0+0+0:vgprValuA_X0_I0+48+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+48+0+0+2:vgprValuA_X0_I0+48+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+48+0+0+4:vgprValuA_X0_I0+48+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+48+0+0+6:vgprValuA_X0_I0+48+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+0], v[vgprValuA_X0_I0+48+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+48+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+0], v[vgprValuA_X0_I0+48+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+48+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+2], v[vgprValuA_X0_I0+48+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+3], v[vgprValuA_X0_I0+48+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+2], v[vgprValuA_X0_I0+48+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+3], v[vgprValuA_X0_I0+48+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+4], v[vgprValuA_X0_I0+48+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+5], v[vgprValuA_X0_I0+48+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+4], v[vgprValuA_X0_I0+48+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+5], v[vgprValuA_X0_I0+48+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+6], v[vgprValuA_X0_I0+48+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+56+0+0+0:vgprValuA_X0_I0+56+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+56+0+0+2:vgprValuA_X0_I0+56+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuA_X0_I0+56+0+0+4:vgprValuA_X0_I0+56+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuA_X0_I0+56+0+0+6:vgprValuA_X0_I0+56+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+6], v[vgprValuA_X0_I0+48+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+56+0+0+0:vgprValuA_X0_I0+56+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+56+0+0+2:vgprValuA_X0_I0+56+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+56+0+0+4:vgprValuA_X0_I0+56+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+56+0+0+6:vgprValuA_X0_I0+56+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+0], v[vgprValuA_X0_I0+56+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+56+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+0], v[vgprValuA_X0_I0+56+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+56+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+2], v[vgprValuA_X0_I0+56+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+3], v[vgprValuA_X0_I0+56+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+2], v[vgprValuA_X0_I0+56+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+3], v[vgprValuA_X0_I0+56+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+4], v[vgprValuA_X0_I0+56+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+5], v[vgprValuA_X0_I0+56+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+4], v[vgprValuA_X0_I0+56+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+5], v[vgprValuA_X0_I0+56+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+6], v[vgprValuA_X0_I0+56+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+0+0+0+0:vgprValuB_X0_I0+0+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+0+0+0+2:vgprValuB_X0_I0+0+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+0+0+0+4:vgprValuB_X0_I0+0+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+0+0+0+6:vgprValuB_X0_I0+0+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+6], v[vgprValuA_X0_I0+56+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+0+0+0+0:vgprValuB_X0_I0+0+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+0+0+0+2:vgprValuB_X0_I0+0+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+0+0+0+4:vgprValuB_X0_I0+0+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+0+0+0+6:vgprValuB_X0_I0+0+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+4], v[vgprValuB_X0_I0+0+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+5], v[vgprValuB_X0_I0+0+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+4], v[vgprValuB_X0_I0+0+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+5], v[vgprValuB_X0_I0+0+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+6], v[vgprValuB_X0_I0+0+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+7], v[vgprValuB_X0_I0+0+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+8+0+0+0:vgprValuB_X0_I0+8+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+8+0+0+2:vgprValuB_X0_I0+8+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+8+0+0+4:vgprValuB_X0_I0+8+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+8+0+0+6:vgprValuB_X0_I0+8+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+6], v[vgprValuB_X0_I0+0+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+7], v[vgprValuB_X0_I0+0+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+8+0+0+0:vgprValuB_X0_I0+8+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+8+0+0+2:vgprValuB_X0_I0+8+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+8+0+0+4:vgprValuB_X0_I0+8+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+8+0+0+6:vgprValuB_X0_I0+8+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+4], v[vgprValuB_X0_I0+8+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+5], v[vgprValuB_X0_I0+8+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+4], v[vgprValuB_X0_I0+8+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+5], v[vgprValuB_X0_I0+8+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+6], v[vgprValuB_X0_I0+8+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+7], v[vgprValuB_X0_I0+8+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+16+0+0+0:vgprValuB_X0_I0+16+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+16+0+0+2:vgprValuB_X0_I0+16+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+16+0+0+4:vgprValuB_X0_I0+16+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+16+0+0+6:vgprValuB_X0_I0+16+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+6], v[vgprValuB_X0_I0+8+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+7], v[vgprValuB_X0_I0+8+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+16+0+0+0:vgprValuB_X0_I0+16+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+16+0+0+2:vgprValuB_X0_I0+16+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+16+0+0+4:vgprValuB_X0_I0+16+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+16+0+0+6:vgprValuB_X0_I0+16+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+4], v[vgprValuB_X0_I0+16+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+5], v[vgprValuB_X0_I0+16+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+4], v[vgprValuB_X0_I0+16+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+5], v[vgprValuB_X0_I0+16+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+6], v[vgprValuB_X0_I0+16+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+7], v[vgprValuB_X0_I0+16+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+24+0+0+0:vgprValuB_X0_I0+24+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+24+0+0+2:vgprValuB_X0_I0+24+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+24+0+0+4:vgprValuB_X0_I0+24+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+24+0+0+6:vgprValuB_X0_I0+24+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+6], v[vgprValuB_X0_I0+16+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+7], v[vgprValuB_X0_I0+16+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+24+0+0+0:vgprValuB_X0_I0+24+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+24+0+0+2:vgprValuB_X0_I0+24+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+24+0+0+4:vgprValuB_X0_I0+24+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+24+0+0+6:vgprValuB_X0_I0+24+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+4], v[vgprValuB_X0_I0+24+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+5], v[vgprValuB_X0_I0+24+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+4], v[vgprValuB_X0_I0+24+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+5], v[vgprValuB_X0_I0+24+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+6], v[vgprValuB_X0_I0+24+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+7], v[vgprValuB_X0_I0+24+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+32+0+0+0:vgprValuB_X0_I0+32+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+32+0+0+2:vgprValuB_X0_I0+32+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+32+0+0+4:vgprValuB_X0_I0+32+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+32+0+0+6:vgprValuB_X0_I0+32+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+6], v[vgprValuB_X0_I0+24+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+7], v[vgprValuB_X0_I0+24+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+32+0+0+0:vgprValuB_X0_I0+32+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+32+0+0+2:vgprValuB_X0_I0+32+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+32+0+0+4:vgprValuB_X0_I0+32+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+32+0+0+6:vgprValuB_X0_I0+32+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+0], v[vgprValuB_X0_I0+32+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+1], v[vgprValuB_X0_I0+32+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+0], v[vgprValuB_X0_I0+32+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+1], v[vgprValuB_X0_I0+32+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+2], v[vgprValuB_X0_I0+32+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+3], v[vgprValuB_X0_I0+32+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+2], v[vgprValuB_X0_I0+32+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+3], v[vgprValuB_X0_I0+32+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+4], v[vgprValuB_X0_I0+32+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+5], v[vgprValuB_X0_I0+32+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+4], v[vgprValuB_X0_I0+32+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+5], v[vgprValuB_X0_I0+32+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+6], v[vgprValuB_X0_I0+32+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+7], v[vgprValuB_X0_I0+32+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+40+0+0+0:vgprValuB_X0_I0+40+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+40+0+0+2:vgprValuB_X0_I0+40+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+40+0+0+4:vgprValuB_X0_I0+40+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+40+0+0+6:vgprValuB_X0_I0+40+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+6], v[vgprValuB_X0_I0+32+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+7], v[vgprValuB_X0_I0+32+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+40+0+0+0:vgprValuB_X0_I0+40+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+40+0+0+2:vgprValuB_X0_I0+40+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+40+0+0+4:vgprValuB_X0_I0+40+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+40+0+0+6:vgprValuB_X0_I0+40+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+0], v[vgprValuB_X0_I0+40+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+1], v[vgprValuB_X0_I0+40+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+0], v[vgprValuB_X0_I0+40+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+1], v[vgprValuB_X0_I0+40+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+2], v[vgprValuB_X0_I0+40+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+3], v[vgprValuB_X0_I0+40+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+2], v[vgprValuB_X0_I0+40+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+3], v[vgprValuB_X0_I0+40+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+4], v[vgprValuB_X0_I0+40+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+5], v[vgprValuB_X0_I0+40+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+4], v[vgprValuB_X0_I0+40+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+5], v[vgprValuB_X0_I0+40+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+6], v[vgprValuB_X0_I0+40+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+7], v[vgprValuB_X0_I0+40+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+48+0+0+0:vgprValuB_X0_I0+48+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+48+0+0+2:vgprValuB_X0_I0+48+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+48+0+0+4:vgprValuB_X0_I0+48+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+48+0+0+6:vgprValuB_X0_I0+48+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+6], v[vgprValuB_X0_I0+40+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+7], v[vgprValuB_X0_I0+40+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+48+0+0+0:vgprValuB_X0_I0+48+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+48+0+0+2:vgprValuB_X0_I0+48+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+48+0+0+4:vgprValuB_X0_I0+48+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+48+0+0+6:vgprValuB_X0_I0+48+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+0], v[vgprValuB_X0_I0+48+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+1], v[vgprValuB_X0_I0+48+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+0], v[vgprValuB_X0_I0+48+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+1], v[vgprValuB_X0_I0+48+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+2], v[vgprValuB_X0_I0+48+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+3], v[vgprValuB_X0_I0+48+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+2], v[vgprValuB_X0_I0+48+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+3], v[vgprValuB_X0_I0+48+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+4], v[vgprValuB_X0_I0+48+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+5], v[vgprValuB_X0_I0+48+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+4], v[vgprValuB_X0_I0+48+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+5], v[vgprValuB_X0_I0+48+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+6], v[vgprValuB_X0_I0+48+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+7], v[vgprValuB_X0_I0+48+0+0+7], v145, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+56+0+0+0:vgprValuB_X0_I0+56+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+56+0+0+2:vgprValuB_X0_I0+56+0+0+2+1] -v_lshlrev_b64 v[142:143], s[sgpr106], v[vgprValuB_X0_I0+56+0+0+4:vgprValuB_X0_I0+56+0+0+4+1] -v_lshlrev_b64 v[144:145], s[sgpr106], v[vgprValuB_X0_I0+56+0+0+6:vgprValuB_X0_I0+56+0+0+6+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+6], v[vgprValuB_X0_I0+48+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+7], v[vgprValuB_X0_I0+48+0+0+7], v145, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+56+0+0+0:vgprValuB_X0_I0+56+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+56+0+0+2:vgprValuB_X0_I0+56+0+0+2+1] +v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+56+0+0+4:vgprValuB_X0_I0+56+0+0+4+1] +v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+56+0+0+6:vgprValuB_X0_I0+56+0+0+6+1] v_add_u32 v136, v135, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+0], v[vgprValuB_X0_I0+56+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+1], v[vgprValuB_X0_I0+56+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+0], v[vgprValuB_X0_I0+56+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+1], v[vgprValuB_X0_I0+56+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+2], v[vgprValuB_X0_I0+56+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+3], v[vgprValuB_X0_I0+56+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+2], v[vgprValuB_X0_I0+56+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+3], v[vgprValuB_X0_I0+56+0+0+3], v141, s[98:99] v_add_u32 v136, v136, 56 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+4], v[vgprValuB_X0_I0+56+0+0+4], v142, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+5], v[vgprValuB_X0_I0+56+0+0+5], v143, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+4], v[vgprValuB_X0_I0+56+0+0+4], v142, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+5], v[vgprValuB_X0_I0+56+0+0+5], v143, s[98:99] v_add_u32 v136, v136, 8 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+6], v[vgprValuB_X0_I0+56+0+0+6], v144, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+7], v[vgprValuB_X0_I0+56+0+0+7], v145, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+6], v[vgprValuB_X0_I0+56+0+0+6], v144, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+7], v[vgprValuB_X0_I0+56+0+0+7], v145, s[98:99] s_nop 1 v_mfma_f32_16x16x128_f8f6f4 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[0:3] cbsz:0 blgp:0 // left value = acc[0+0:3+0] v_mfma_f32_16x16x128_f8f6f4 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[4:7] cbsz:0 blgp:0 // left value = acc[4+0:7+0] @@ -3992,12 +3939,12 @@ s_add_u32 s[sgprOrigLoopCounter], s[sgprOrigLoopCounter], 0x80 // inc counterL s_cmp_le_i32 s[sgprLoopCounterL], 0x0 // counterL<=0 s_cbranch_scc0 label_TailLoopBeginL // restart LoopL label_TailLoopEndL: -s_mov_b32 s[sgpr104], 1 // tailloop lds offset -s_mul_i32 s[sgpr104], s[sgprOrigLoopCounter], s[sgpr104] // scale by mul -v_sub_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgpr104] // remove lro damage -s_mov_b32 s[sgpr104], 1 // tailloop lds offset -s_mul_i32 s[sgpr104], s[sgprOrigLoopCounter], s[sgpr104] // scale by mul -v_sub_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgpr104] // remove lro damage +s_mov_b32 s97, 1 // tailloop lds offset +s_mul_i32 s97, s[sgprOrigLoopCounter], s97 // scale by mul +v_sub_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s97 // remove lro damage +s_mov_b32 s97, 1 // tailloop lds offset +s_mul_i32 s97, s[sgprOrigLoopCounter], s97 // scale by mul +v_sub_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s97 // remove lro damage label_SkipTailLoopL: .set vgprValuA_X0_I0_BASE, UNDEF .set vgprValuA_X0_I0, UNDEF @@ -4008,11 +3955,11 @@ label_SkipTailLoopL: label_Summation_End_DZOUDPYJU2HHRCOQ: .set sgprLoopCounterL, UNDEF .set sgprOrigLoopCounter, UNDEF -.set sgprStaggerUIter, UNDEF .set sgprSrdA, UNDEF .set sgprSrdB, UNDEF .set sgprShadowLimitA, UNDEF .set sgprShadowLimitB, UNDEF +.set sgprStaggerUIter, UNDEF .set sgprWrapUA, UNDEF .set sgprWrapUB, UNDEF .set sgprGlobalReadIncsA, UNDEF @@ -4020,58 +3967,31 @@ label_Summation_End_DZOUDPYJU2HHRCOQ: .set sgprScalarGlobalReadOffsetA, UNDEF .set sgprScalarGlobalReadOffsetB, UNDEF /* load store sgprs */ -.set sgprAddressScaleA, 72 -.set sgprAddressScaleB, 74 -.set sgprAddressScaleAlphaVec, 76 -.set sgprAddressBias, 78 -.set sgprBiasType, 80 -.set sgprBiasStride, 81 -.set sgpractivationAlpha, 82 -.set sgpractivationBeta, 83 -.set sgprActivationType, 84 - -v_readlane_b32 s[sgprSKItersPerWG], v255, 0 -s_nop 0 -v_readlane_b32 s[sgprskGrid], v255, 1 -s_nop 0 -v_readlane_b32 s[sgprMagicNumberProblemNumGroupTiles0], v255, 2 -s_nop 0 -v_readlane_b32 s[sgprMagicShiftProblemNumGroupTiles0], v255, 3 -s_nop 0 -v_readlane_b32 s[sgprMagicShiftItersPerTile], v255, 4 -s_nop 0 -v_readlane_b32 s[sgprMagicNumProblemNumGroupTiles0By1], v255, 5 -s_nop 0 -v_readlane_b32 s[sgprWGM], v255, 6 -s_nop 0 -v_readlane_b32 s[sgprKernArgAddress], v255, 7 -s_nop 0 -v_readlane_b32 s[sgprKernArgAddress+1], v255, 8 - -.set sgpr104, UNDEF -.set sgpr105, UNDEF -.set sgpr106, UNDEF -.set sgpr107, UNDEF -.set sgpr108, UNDEF -.set sgpr109, UNDEF -.set sgpr110, UNDEF - +.set sgprAddressScaleA, 64 +.set sgprAddressScaleB, 66 +.set sgprAddressScaleAlphaVec, 68 +.set sgprAddressBias, 70 +.set sgprBiasType, 72 +.set sgprBiasStride, 73 +.set sgpractivationAlpha, 74 +.set sgpractivationBeta, 75 +.set sgprActivationType, 76 /* Check if custom structure pointer is null */ s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ? s_cbranch_scc1 label_LoadExternalEpilogueStruct // branch if ArgType == 2 -s_load_dwordx8 s[72:79], s[sgprKernArgAddress:sgprKernArgAddress+1], 152 // 152 -s_load_dwordx4 s[80:83], s[sgprKernArgAddress:sgprKernArgAddress+1], 184 // 184 -s_load_dword s84, s[sgprKernArgAddress:sgprKernArgAddress+1], 200 // 200 +s_load_dwordx8 s[64:71], s[sgprKernArgAddress:sgprKernArgAddress+1], 124 // 124 +s_load_dwordx4 s[72:75], s[sgprKernArgAddress:sgprKernArgAddress+1], 156 // 156 +s_load_dword s76, s[sgprKernArgAddress:sgprKernArgAddress+1], 172 // 172 s_branch label_LoadExternalEpilogueStructEnd label_LoadExternalEpilogueStruct: -s_load_dwordx4 s[72:75], s[sgprKernArgAddress:sgprKernArgAddress+1], 176 // 176 -s_load_dwordx4 s[76:79], s[sgprKernArgAddress:sgprKernArgAddress+1], 208 // 208 -s_load_dwordx2 s[80:81], s[sgprKernArgAddress:sgprKernArgAddress+1], 224 // 224 -s_load_dwordx2 s[82:83], s[sgprKernArgAddress:sgprKernArgAddress+1], 248 // 248 -s_load_dword s84, s[sgprKernArgAddress:sgprKernArgAddress+1], 256 // 256 +s_load_dwordx4 s[64:67], s[sgprKernArgAddress:sgprKernArgAddress+1], 148 // 148 +s_load_dwordx4 s[68:71], s[sgprKernArgAddress:sgprKernArgAddress+1], 180 // 180 +s_load_dwordx2 s[72:73], s[sgprKernArgAddress:sgprKernArgAddress+1], 196 // 196 +s_load_dwordx2 s[74:75], s[sgprKernArgAddress:sgprKernArgAddress+1], 220 // 220 +s_load_dword s76, s[sgprKernArgAddress:sgprKernArgAddress+1], 228 // 228 label_LoadExternalEpilogueStructEnd: -.set sgprSrdScaleAlphaVec, 88 -.set sgprSrdBias, 92 +.set sgprSrdScaleAlphaVec, 80 +.set sgprSrdBias, 84 /* Mapping of Acc register -> C Vgpr register */ @@ -4122,10 +4042,10 @@ s_mov_b32 s[sgprSrdScaleAlphaVec+2], s[sgprSizeI] label_ScaleAlphaVecAddrValid_End: s_mul_i32 s[sgprSrdScaleAlphaVec+2], 0x4, s[sgprSrdScaleAlphaVec+2] // ScaleAlphaVec scaled by BPE -s_add_u32 s67, s[sgprWorkGroup2], 0x1 -s_mul_i32 s67, s[sgprBiasStride], s67 // stride * (wg+1) -s_cmp_eq_u32 s67, 0 // bias stride = 0? -s_cselect_b32 s67, s[sgprSizeI], s67 +s_add_u32 s77, s[sgprWorkGroup2], 0x1 +s_mul_i32 s77, s[sgprBiasStride], s77 // stride * (wg+1) +s_cmp_eq_u32 s77, 0 // bias stride = 0? +s_cselect_b32 s77, s[sgprSizeI], s77 s_mov_b64 s[sgprSrdBias+0:sgprSrdBias+0+1], s[sgprAddressBias+0:sgprAddressBias+0+1] // init SRD base address s_mov_b32 s[sgprSrdBias+3], Srd127_96 // Set bits 127_96 in post-loop SRD s_cmp_eq_u64 s[sgprAddressBias:sgprAddressBias+1], 0 // s[AddressBias] == 0 ? @@ -4133,7 +4053,7 @@ s_cbranch_scc0 label_BiasAddrValid // branch if s[AddressBias] ! s_mov_b32 s[sgprSrdBias+2], 0 s_branch label_BiasAddrValid_End label_BiasAddrValid: -s_mov_b32 s[sgprSrdBias+2], s67 +s_mov_b32 s[sgprSrdBias+2], s77 label_BiasAddrValid_End: label_Load_Biasf32_0: @@ -4143,15 +4063,15 @@ s_cbranch_scc1 label_Load_Biasbf16_0 // Branch if true /******************************************/ /* Read vector to LDS */ /******************************************/ -s_mul_i32 s67, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_add_u32 v8, s67, v[vgprSerial] // coord 0 = wgp0 * MT0 + thread offset +s_mul_i32 s77, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_add_u32 v8, s77, v[vgprSerial] // coord 0 = wgp0 * MT0 + thread offset s_mul_i32 s[sgprSrdBias+2], 0x4, s[sgprSrdBias+2] // scaled by BPE -s_mul_i32 s67, s[sgprBiasStride], s[sgprWorkGroup2] // Stride * WG -v_add_u32 v6, s67, v8 // coord 0 = wgp0 * MT0 + thread offset + Stride * WG +s_mul_i32 s77, s[sgprBiasStride], s[sgprWorkGroup2] // Stride * WG +v_add_u32 v6, s77, v8 // coord 0 = wgp0 * MT0 + thread offset + Stride * WG v_lshlrev_b32 v6, 0x2, v6 // Global bias address scaled by BPE v_lshlrev_b32 v7, 0x2, v8 // Global scaleAlpha address scaled by BPE -s_mul_i32 s67, 256, s[sgprWorkGroup1] // wgp1 * MT1 -v_add_u32 v8, s67, v[vgprSerial] // coord 1 = wgp1 * MT1 + thread offset +s_mul_i32 s77, 256, s[sgprWorkGroup1] // wgp1 * MT1 +v_add_u32 v8, s77, v[vgprSerial] // coord 1 = wgp1 * MT1 + thread offset buffer_load_dword v4, v6, s[sgprSrdBias:sgprSrdBias+3], 0 offen offset:0 // Load Bias buffer_load_dword v5, v7, s[sgprSrdScaleAlphaVec:sgprSrdScaleAlphaVec+3], 0 offen offset:0 // Load ScaleAlphaVec v_lshlrev_b32 v8, 0x2, v[vgprSerial] // Local address scaled by BPE @@ -4170,15 +4090,15 @@ s_cbranch_scc1 label_Load_Bias_End // Branch if true /******************************************/ /* Read vector to LDS */ /******************************************/ -s_mul_i32 s67, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_add_u32 v8, s67, v[vgprSerial] // coord 0 = wgp0 * MT0 + thread offset +s_mul_i32 s77, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_add_u32 v8, s77, v[vgprSerial] // coord 0 = wgp0 * MT0 + thread offset s_mul_i32 s[sgprSrdBias+2], 0x2, s[sgprSrdBias+2] // scaled by BPE -s_mul_i32 s67, s[sgprBiasStride], s[sgprWorkGroup2] // Stride * WG -v_add_u32 v6, s67, v8 // coord 0 = wgp0 * MT0 + thread offset + Stride * WG +s_mul_i32 s77, s[sgprBiasStride], s[sgprWorkGroup2] // Stride * WG +v_add_u32 v6, s77, v8 // coord 0 = wgp0 * MT0 + thread offset + Stride * WG v_lshlrev_b32 v6, 0x1, v6 // Global bias address scaled by BPE v_lshlrev_b32 v7, 0x2, v8 // Global scaleAlpha address scaled by BPE -s_mul_i32 s67, 256, s[sgprWorkGroup1] // wgp1 * MT1 -v_add_u32 v8, s67, v[vgprSerial] // coord 1 = wgp1 * MT1 + thread offset +s_mul_i32 s77, 256, s[sgprWorkGroup1] // wgp1 * MT1 +v_add_u32 v8, s77, v[vgprSerial] // coord 1 = wgp1 * MT1 + thread offset buffer_load_short_d16 v4, v6, s[sgprSrdBias:sgprSrdBias+3], 0 offen offset:0 // Load Bias buffer_load_dword v5, v7, s[sgprSrdScaleAlphaVec:sgprSrdScaleAlphaVec+3], 0 offen offset:0 // Load ScaleAlphaVec v_lshlrev_b32 v8, 0x2, v[vgprSerial] // Local address scaled by BPE @@ -4201,38 +4121,48 @@ s_waitcnt lgkmcnt(0) // wait for scaleAB load v_mul_f32 v4, v4, s8 v_mul_f32 v4, v4, s9 s_nop 0 // 1 wait states -s_mov_b32 s67, s[sgprAlpha] // Save alpha value +s_mov_b32 s64, s[sgprAlpha] // Save alpha value v_readfirstlane_b32 s[sgprAlpha], v4 // Update Alpha s_cmp_eq_u32 s[sgprStreamKLocalStart], 0 // does wg start tile? s_cbranch_scc1 label_NoBranch_QWMA7J3AUDGL0X23 // Only branch on scc0 -s_getpc_b64 s[86:87] // addr of next instr -s_add_i32 s88, label_SK_Partials, 4 // target branch offset -s_add_u32 s86, s86, s88 // add target branch offset -s_addc_u32 s87, s87, 0 // add high and carry -s_setpc_b64 s[86:87] // branch to label_SK_Partials +s_getpc_b64 s[88:89] // addr of next instr +s_add_i32 s90, label_SK_Partials, 4 // target branch offset +s_add_u32 s88, s88, s90 // add target branch offset +s_addc_u32 s89, s89, 0 // add high and carry +s_setpc_b64 s[88:89] // branch to label_SK_Partials label_NoBranch_QWMA7J3AUDGL0X23: s_cmp_eq_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // does wg finish tile? s_cbranch_scc1 label_SK_Store // Branch if started and finished tile, go to regular store code -s_add_u32 s85, s[sgprStreamKIdx], 1 // input partial tile index -s_mul_hi_u32 s75, s[sgprStreamKIterEnd], s[sgprMagicNumberItersPerTile] // s_magic mul, div alg 2 -s_lshr_b32 s76, s[sgprMagicShiftItersPerTile], 31 // tmpS = extract abit -s_mul_i32 s74, s[sgprStreamKIterEnd], s76 // s_magic mul, div alg 2 -s_add_u32 s74, s74, s75 -s_and_b32 s76, s[sgprMagicShiftItersPerTile], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s74, s74, s76 // sMagicDiv Alg 2 -s_mul_i32 s74, s74, s[sgprItersPerTile] // start iteration of partial tile -s_sub_u32 s86, s[sgprStreamKIterEnd], s74 // calc iterations completed by this WG +s_add_u32 s65, s[sgprStreamKIdx], 1 // input partial tile index +v_cvt_f32_u32 v17, s[sgprItersPerTile] // StreamKIterEnd // ItersPerTile +v_rcp_iflag_f32 v17, v17 // StreamKIterEnd // ItersPerTile +v_cvt_f32_u32 v18, s[sgprStreamKIterEnd] // StreamKIterEnd // ItersPerTile +v_mul_f32 v17, v17, v18 // StreamKIterEnd // ItersPerTile +v_cvt_u32_f32 v17, v17 // StreamKIterEnd // ItersPerTile +v_mul_u32_u24 v18, v17, s[sgprItersPerTile] // StreamKIterEnd // ItersPerTile +v_sub_u32 v18, s[sgprStreamKIterEnd], v18 // StreamKIterEnd // ItersPerTile +v_cmpx_eq_u32 exec, v18, s[sgprItersPerTile] // StreamKIterEnd // ItersPerTile +v_add_u32 v17, 1, v17 // StreamKIterEnd // ItersPerTile +v_mov_b32 v18, 0 // StreamKIterEnd // ItersPerTile +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v18, s[sgprItersPerTile] // overflow happened in remainder +v_sub_u32 v17, v17, 1 // quotient - 1 +v_mul_u32_u24 v18, v17, s[sgprItersPerTile] // re-calculate remainder +v_sub_u32 v18, s[sgprStreamKIterEnd], v18 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s77, v17 // quotient +v_readfirstlane_b32 s68, v18 // remainder label_SK_Fixup: -s_lshl_b32 s74, s85, 2 // flag offset based on CTA index -s_load_dword s76, s[sgprAddressFlags:sgprAddressFlags+1], s74 glc // get flag +s_lshl_b32 s77, s65, 2 // flag offset based on CTA index +s_load_dword s79, s[sgprAddressFlags:sgprAddressFlags+1], s77 glc // get flag s_waitcnt lgkmcnt(0) // wait for flag load -s_cmp_eq_u32 s76, 1 // check if ready +s_cmp_eq_u32 s79, 1 // check if ready s_cbranch_scc0 label_SK_Fixup // if flag not set, wait and check again s_barrier // wait for all workgroups before resetting flag -v_readfirstlane_b32 s76, v[vgprSerial] // Wave 0 updates flags -s_cmp_eq_u32 s76, 0 // Check for wave 0 +v_readfirstlane_b32 s79, v[vgprSerial] // Wave 0 updates flags +s_cmp_eq_u32 s79, 0 // Check for wave 0 s_cbranch_scc0 label_SK_SkipFlagReset // Skip flag reset -s_store_dword s76, s[sgprAddressFlags:sgprAddressFlags+1], s74 glc // reset flag +s_store_dword s79, s[sgprAddressFlags:sgprAddressFlags+1], s77 glc // reset flag label_SK_SkipFlagReset: label_Fixup_E0: @@ -4241,8 +4171,8 @@ s_mov_b64 s[sgprSrdWS+0:sgprSrdWS+0+1], s[sgprAddressWS+0:sgprAddressWS+0+1] // s_mov_b32 s[sgprSrdWS+2], BufferOOB s_mov_b32 s[sgprSrdWS+3], Srd127_96 // Set bits 127_96 in post-loop SRD -s_mul_i32 s74, 0x40000, s85 // Offset to correct partials tile -s_add_u32 s[sgprSrdWS+0], s[sgprSrdWS+0], s74 // add lo to SRD +s_mul_i32 s78, 0x40000, s65 // Offset to correct partials tile +s_add_u32 s[sgprSrdWS+0], s[sgprSrdWS+0], s78 // add lo to SRD s_addc_u32 s[sgprSrdWS+1], s[sgprSrdWS+1], 0 // add hi to SRD /* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ @@ -4253,45 +4183,45 @@ s_addc_u32 s[sgprSrdWS+1], s[sgprSrdWS+1], 0 // add hi to SRD /* calc coords, apply mask, and issue loads (if necessary) */ v_lshlrev_b32 v18, 5, v[vgprSerial] // v18 = v[vgprSerial] * 32 -s_mov_b32 s74, 0 // Init sgpr offset -buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[224:227], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[228:231], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[232:235], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[236:239], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_mov_b32 s78, 0 // Init sgpr offset +buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[224:227], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[228:231], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[232:235], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[236:239], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] @@ -4646,45 +4576,45 @@ s_nop 0 // 1 wait state required when /******************************************/ /* calc coords, apply mask, and issue loads (if necessary) */ -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[224:227], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[228:231], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[232:235], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[236:239], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[224:227], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[228:231], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[232:235], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[236:239], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS v_accvgpr_read_b32 v[vgprValuC+24], acc161 // copy acc to vreg[104] v_accvgpr_read_b32 v[vgprValuC+25], acc165 // copy acc to vreg[105] v_accvgpr_read_b32 v[vgprValuC+26], acc169 // copy acc to vreg[106] @@ -5039,24 +4969,24 @@ s_nop 0 // 1 wait state required when /******************************************/ /* calc coords, apply mask, and issue loads (if necessary) */ -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[72:75], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[76:79], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[80:83], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[84:87], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[88:91], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[92:95], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[96:99], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[100:103], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[104:107], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[108:111], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS -s_add_u32 s74, s74, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[112:115], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS -buffer_load_dwordx4 v[116:119], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[72:75], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[76:79], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[80:83], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[84:87], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[88:91], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[92:95], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[96:99], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[100:103], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[104:107], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[108:111], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS +s_add_u32 s78, s78, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[112:115], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS +buffer_load_dwordx4 v[116:119], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS v_accvgpr_read_b32 v[vgprValuC+24], acc67 // copy acc to vreg[208] v_accvgpr_read_b32 v[vgprValuC+25], acc71 // copy acc to vreg[209] v_accvgpr_read_b32 v[vgprValuC+26], acc75 // copy acc to vreg[210] @@ -5221,42 +5151,42 @@ v_accvgpr_write_b32 acc251, v[vgprValuC+70] // copy vreg[254] to acc v_accvgpr_write_b32 acc255, v[vgprValuC+71] // copy vreg[255] to acc s_nop 1 // 2 wait states required before reading vgpr s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -s_add_u32 s74, s[sgprSKItersPerWG], 1 // Add extra iter -s_cmp_lt_u32 s85, s[sgprskExtraIters] // Check if next WG had an extra iteration -s_cselect_b32 s74, s74, s[sgprSKItersPerWG] // Select correct number of iterations for next WG -s_add_u32 s86, s86, s74 // next partial tile iteration -s_add_u32 s85, s85, 1 // next partial tile index -s_cmp_lt_u32 s86, s[sgprItersPerTile] // done loading partial tiles? +s_add_u32 s69, s[sgprSKItersPerWG], 1 // Add extra iter +s_cmp_lt_u32 s65, s[sgprskExtraIters] // Check if next WG had an extra iteration +s_cselect_b32 s69, s69, s[sgprSKItersPerWG] // Select correct number of iterations for next WG +s_add_u32 s68, s68, s69 // next partial tile iteration +s_add_u32 s65, s65, 1 // next partial tile index +s_cmp_lt_u32 s68, s[sgprItersPerTile] // done loading partial tiles? s_cbranch_scc1 label_SK_Fixup // Branch to continue fixup loop label_SK_Store: s_cmpk_eq_u32 s[sgprBeta], 0 // Beta == 0 s_cbranch_scc0 label_GW_Beta // Branch if Beta is not zero -s_and_b32 s74, 255, s[sgprSizeI] // s74 = s[sgprSizeI] % 256 -s_add_u32 s75, -0x1, s[sgprNumWorkGroups0] -s_cmp_ge_u32 s[sgprWorkGroup0], s75 // wg0 >= nwg0-1 ? -s_cselect_b32 s74, s74, 0 // set rMT0 -s_cmpk_gt_u32 s74, 0 // rMT0 > 0 +s_and_b32 s78, 255, s[sgprSizeI] // s78 = s[sgprSizeI] % 256 +s_add_u32 s79, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s79 // wg0 >= nwg0-1 ? +s_cselect_b32 s78, s78, 0 // set rMT0 +s_cmpk_gt_u32 s78, 0 // rMT0 > 0 s_cbranch_scc0 label_NoBranch_0MXDW6EW9K7ZNG8F // Only branch on scc1 // jump if edges required -s_getpc_b64 s[74:75] // addr of next instr -s_add_i32 s76, label_GW_B0_E1_M, 4 // target branch offset -s_add_u32 s74, s74, s76 // add target branch offset -s_addc_u32 s75, s75, 0 // add high and carry -s_setpc_b64 s[74:75] // branch to label_GW_B0_E1_M +s_getpc_b64 s[78:79] // addr of next instr +s_add_i32 s80, label_GW_B0_E1_M, 4 // target branch offset +s_add_u32 s78, s78, s80 // add target branch offset +s_addc_u32 s79, s79, 0 // add high and carry +s_setpc_b64 s[78:79] // branch to label_GW_B0_E1_M label_NoBranch_0MXDW6EW9K7ZNG8F: -s_and_b32 s74, 255, s[sgprSizeJ] // s74 = s[sgprSizeJ] % 256 -s_add_u32 s75, -0x1, s[sgprNumWorkGroups1] -s_cmp_ge_u32 s[sgprWorkGroup1], s75 // wg1 >= nwg1-1 -s_cselect_b32 s74, s74, 0 // set rMT1 -s_cmpk_gt_u32 s74, 0 // rMT1 > 0 +s_and_b32 s78, 255, s[sgprSizeJ] // s78 = s[sgprSizeJ] % 256 +s_add_u32 s79, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s79 // wg1 >= nwg1-1 +s_cselect_b32 s78, s78, 0 // set rMT1 +s_cmpk_gt_u32 s78, 0 // rMT1 > 0 s_cbranch_scc0 label_NoBranch_IXPKU979JKZCQDH3 // Only branch on scc1 // jump if edges required -s_getpc_b64 s[74:75] // addr of next instr -s_add_i32 s76, label_GW_B0_E1_N, 4 // target branch offset -s_add_u32 s74, s74, s76 // add target branch offset -s_addc_u32 s75, s75, 0 // add high and carry -s_setpc_b64 s[74:75] // branch to label_GW_B0_E1_N +s_getpc_b64 s[78:79] // addr of next instr +s_add_i32 s80, label_GW_B0_E1_N, 4 // target branch offset +s_add_u32 s78, s78, s80 // add target branch offset +s_addc_u32 s79, s79, 0 // add high and carry +s_setpc_b64 s[78:79] // branch to label_GW_B0_E1_N label_NoBranch_IXPKU979JKZCQDH3: label_GW_B0_E0: s_cmpk_eq_u32 s[sgprActivationType], 3 // activationType == 3 @@ -5265,28 +5195,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_0_edge_0 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_0_edge_0 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_0_edge_0 // Branch if true label_To_Activation_None_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_To_Activation_Gelu_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_To_Activation_Relu_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_To_Activation_Silu_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_5 +label_To_Activation_Clamp_VW8_beta_0_edge_0: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s65, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_ActivationSetPCAddrEnd_5: @@ -5301,8 +5239,8 @@ label_ActivationSetPCAddrEnd_5: /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v19, v0, s74 +s_mul_i32 s68, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v19, v0, s68 v_lshlrev_b32 v19, 0x2, v19 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -5431,7 +5369,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -5449,7 +5387,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -5458,8 +5396,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5470,7 +5408,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -5479,8 +5417,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5491,7 +5429,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -5500,8 +5438,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5512,7 +5450,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -5521,8 +5459,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5533,7 +5471,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -5542,8 +5480,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5554,7 +5492,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -5563,8 +5501,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5575,7 +5513,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -5584,8 +5522,8 @@ v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -5722,7 +5660,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -5731,8 +5669,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5743,7 +5681,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -5752,8 +5690,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5764,7 +5702,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -5773,8 +5711,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5785,7 +5723,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -5794,8 +5732,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5806,7 +5744,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -5815,8 +5753,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5827,7 +5765,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -5836,8 +5774,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5848,7 +5786,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -5857,8 +5795,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5869,7 +5807,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -5878,8 +5816,8 @@ v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6016,7 +5954,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -6025,8 +5963,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6037,7 +5975,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -6046,8 +5984,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6058,7 +5996,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -6067,8 +6005,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6079,7 +6017,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -6088,8 +6026,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6100,7 +6038,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -6109,8 +6047,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6121,7 +6059,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -6130,8 +6068,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6142,7 +6080,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -6151,8 +6089,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6163,7 +6101,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -6172,8 +6110,8 @@ v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6310,7 +6248,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -6319,8 +6257,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6331,7 +6269,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -6340,8 +6278,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6352,7 +6290,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -6361,8 +6299,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6373,7 +6311,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -6382,8 +6320,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6394,7 +6332,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -6403,8 +6341,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6415,7 +6353,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -6424,8 +6362,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6436,7 +6374,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -6445,8 +6383,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6457,7 +6395,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -6466,8 +6404,8 @@ v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6479,28 +6417,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_0_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_0_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_0_edge_1 // Branch if true label_To_Activation_None_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_To_Activation_Gelu_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_To_Activation_Relu_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_To_Activation_Silu_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_4 +label_To_Activation_Clamp_VW8_beta_0_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s65, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_ActivationSetPCAddrEnd_4: @@ -6516,11 +6462,11 @@ label_ActivationSetPCAddrEnd_4: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -6529,105 +6475,105 @@ ds_read_b128 v[92:95], v18 offset:16 // load Bias ds_read_b128 v[96:99], v18 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v20, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v20, v0, s78 v_lshlrev_b32 v20, 0x2, v20 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v0, s78 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE v_add_lshl_u32 v21, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v12, v21, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v21, v12, v21, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v12, v23, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v23, v12, v23, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v0, s78 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v0, s78 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v12, v109, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v109, v12, v109, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v0, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] @@ -6740,7 +6686,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -6758,7 +6704,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -6776,7 +6722,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -6794,7 +6740,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -6812,7 +6758,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -6830,7 +6776,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -6848,7 +6794,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -6866,7 +6812,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -6892,116 +6838,116 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[88:91], v18 offset:0 // load Bias ds_read_b128 v[92:95], v18 offset:16 // load Bias ds_read_b128 v[96:99], v18 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v20, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v20, v0, s78 v_lshlrev_b32 v20, 0x2, v20 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v0, s78 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE v_add_lshl_u32 v21, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v12, v21, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v21, v12, v21, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v12, v23, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v23, v12, v23, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v0, s78 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v0, s78 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v12, v109, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v109, v12, v109, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v0, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc1 // copy acc to vreg[64] v_accvgpr_read_b32 v[vgprValuC+25], acc5 // copy acc to vreg[65] v_accvgpr_read_b32 v[vgprValuC+26], acc9 // copy acc to vreg[66] @@ -7114,7 +7060,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -7132,7 +7078,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -7150,7 +7096,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -7168,7 +7114,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -7186,7 +7132,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -7204,7 +7150,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -7222,7 +7168,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -7240,7 +7186,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -7266,116 +7212,116 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[88:91], v18 offset:0 // load Bias ds_read_b128 v[92:95], v18 offset:16 // load Bias ds_read_b128 v[96:99], v18 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v20, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v20, v0, s78 v_lshlrev_b32 v20, 0x2, v20 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v0, s78 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE v_add_lshl_u32 v21, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v12, v21, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v21, v12, v21, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v12, v23, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v23, v12, v23, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v0, s78 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v0, s78 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v12, v109, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v109, v12, v109, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v0, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc2 // copy acc to vreg[128] v_accvgpr_read_b32 v[vgprValuC+25], acc6 // copy acc to vreg[129] v_accvgpr_read_b32 v[vgprValuC+26], acc10 // copy acc to vreg[130] @@ -7488,7 +7434,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -7506,7 +7452,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -7524,7 +7470,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -7542,7 +7488,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -7560,7 +7506,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -7578,7 +7524,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -7596,7 +7542,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -7614,7 +7560,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -7640,116 +7586,116 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[88:91], v18 offset:0 // load Bias ds_read_b128 v[92:95], v18 offset:16 // load Bias ds_read_b128 v[96:99], v18 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v20, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v20, v0, s78 v_lshlrev_b32 v20, 0x2, v20 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v0, s78 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE v_add_lshl_u32 v21, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v12, v21, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v21, v12, v21, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v12, v23, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v23, v12, v23, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v0, s78 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v0, s78 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v12, v109, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v109, v12, v109, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v0, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc3 // copy acc to vreg[192] v_accvgpr_read_b32 v[vgprValuC+25], acc7 // copy acc to vreg[193] v_accvgpr_read_b32 v[vgprValuC+26], acc11 // copy acc to vreg[194] @@ -7862,7 +7808,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -7880,7 +7826,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -7898,7 +7844,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -7916,7 +7862,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -7934,7 +7880,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -7952,7 +7898,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -7970,7 +7916,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -7988,7 +7934,7 @@ v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[80:81], v[4:5] v_mov_b64 v[82:83], v[6:7] v_mov_b64 v[84:85], v[8:9] @@ -8007,28 +7953,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW1_beta_0_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW1_beta_0_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW1_beta_0_edge_1 // Branch if true label_To_Activation_None_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_None_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_None_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_To_Activation_Gelu_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Gelu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Gelu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_To_Activation_Relu_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Relu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Relu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_To_Activation_Silu_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Silu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Silu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_3 +label_To_Activation_Clamp_VW1_beta_0_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s65, label_Activation_Clamp_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_ActivationSetPCAddrEnd_3: @@ -8044,492 +7998,492 @@ label_ActivationSetPCAddrEnd_3: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v65, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v65, v0, s78 v_lshlrev_b32 v65, 0x2, v65 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier ds_read_b32 v62, v65 offset:0 // load Bias ds_read_b32 v63, v65 offset:1024 // load scaleAlpha v_add_lshl_u32 v64, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v64, v12, v64, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v64, v12, v64, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v73, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v73, v4, s78 v_lshlrev_b32 v73, 0x2, v73 // Bias address scaled by BPE ds_read_b32 v70, v73 offset:0 // load Bias ds_read_b32 v71, v73 offset:1024 // load scaleAlpha v_add_lshl_u32 v72, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v72, v12, v72, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v72, v12, v72, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v77, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v77, v4, s78 v_lshlrev_b32 v77, 0x2, v77 // Bias address scaled by BPE ds_read_b32 v74, v77 offset:0 // load Bias ds_read_b32 v75, v77 offset:1024 // load scaleAlpha v_add_lshl_u32 v76, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v76, v12, v76, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v76, v12, v76, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v4, s78 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v12, v80, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v80, v12, v80, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v85, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v85, v4, s78 v_lshlrev_b32 v85, 0x2, v85 // Bias address scaled by BPE ds_read_b32 v82, v85 offset:0 // load Bias ds_read_b32 v83, v85 offset:1024 // load scaleAlpha v_add_lshl_u32 v84, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v84, v12, v84, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v84, v12, v84, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE ds_read_b32 v90, v93 offset:0 // load Bias ds_read_b32 v91, v93 offset:1024 // load scaleAlpha v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v95, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v95, v0, s78 v_lshlrev_b32 v95, 0x2, v95 // Bias address scaled by BPE v_add_lshl_u32 v94, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v94, v12, v94, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v94, v12, v94, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v101, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v101, v4, s78 v_lshlrev_b32 v101, 0x2, v101 // Bias address scaled by BPE v_add_lshl_u32 v100, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v100, v12, v100, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v100, v12, v100, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v107, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v107, v4, s78 v_lshlrev_b32 v107, 0x2, v107 // Bias address scaled by BPE v_add_lshl_u32 v106, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v106, v12, v106, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v106, v12, v106, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v0, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v113, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v113, v4, s78 v_lshlrev_b32 v113, 0x2, v113 // Bias address scaled by BPE v_add_lshl_u32 v112, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v112, v12, v112, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v112, v12, v112, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v4, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v119, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v119, v4, s78 v_lshlrev_b32 v119, 0x2, v119 // Bias address scaled by BPE v_add_lshl_u32 v118, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v118, v12, v118, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v118, v12, v118, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v125, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v125, v4, s78 v_lshlrev_b32 v125, 0x2, v125 // Bias address scaled by BPE v_add_lshl_u32 v124, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v124, v12, v124, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v124, v12, v124, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v0, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v4, s78 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v131, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v131, v4, s78 v_lshlrev_b32 v131, 0x2, v131 // Bias address scaled by BPE v_add_lshl_u32 v130, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v130, v12, v130, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v130, v12, v130, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v4, s78 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v140, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v140, v4, s78 v_lshlrev_b32 v140, 0x2, v140 // Bias address scaled by BPE v_add_lshl_u32 v139, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v139, v12, v139, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v139, v12, v139, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v4, s78 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v146, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v146, v0, s78 v_lshlrev_b32 v146, 0x2, v146 // Bias address scaled by BPE v_add_lshl_u32 v145, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v145, v12, v145, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v145, v12, v145, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s78 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v12, v149, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v149, v12, v149, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v152, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v152, v4, s78 v_lshlrev_b32 v152, 0x2, v152 // Bias address scaled by BPE v_add_lshl_u32 v151, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v151, v12, v151, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v151, v12, v151, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v4, s78 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v12, v155, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v155, v12, v155, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v158, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v158, v4, s78 v_lshlrev_b32 v158, 0x2, v158 // Bias address scaled by BPE v_add_lshl_u32 v157, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v157, v12, v157, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v157, v12, v157, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v0, s78 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v12, v161, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v161, v12, v161, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v164, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v164, v4, s78 v_lshlrev_b32 v164, 0x2, v164 // Bias address scaled by BPE v_add_lshl_u32 v163, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v163, v12, v163, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v163, v12, v163, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v4, s78 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v12, v167, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v167, v12, v167, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v170, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v170, v4, s78 v_lshlrev_b32 v170, 0x2, v170 // Bias address scaled by BPE v_add_lshl_u32 v169, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v169, v12, v169, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v169, v12, v169, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+18], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+19], acc8 // copy acc to vreg[2] @@ -8608,271 +8562,271 @@ v_mov_b32 v15, 0x7fff0000 // fp32 Nan v_mov_b32 v16, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v55, v4 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v56, v4 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v57, v4 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v58, v4 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v59, v4 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v60, v4 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v61, v4 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -8888,494 +8842,494 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,5,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v65, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v65, v4, s78 v_lshlrev_b32 v65, 0x2, v65 // Bias address scaled by BPE ds_read_b32 v62, v65 offset:0 // load Bias ds_read_b32 v63, v65 offset:1024 // load scaleAlpha v_add_lshl_u32 v64, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v64, v12, v64, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v64, v12, v64, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v73, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v73, v4, s78 v_lshlrev_b32 v73, 0x2, v73 // Bias address scaled by BPE ds_read_b32 v70, v73 offset:0 // load Bias ds_read_b32 v71, v73 offset:1024 // load scaleAlpha v_add_lshl_u32 v72, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v72, v12, v72, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v72, v12, v72, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v77, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v77, v0, s78 v_lshlrev_b32 v77, 0x2, v77 // Bias address scaled by BPE ds_read_b32 v74, v77 offset:0 // load Bias ds_read_b32 v75, v77 offset:1024 // load scaleAlpha v_add_lshl_u32 v76, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v76, v12, v76, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v76, v12, v76, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v4, s78 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v12, v80, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v80, v12, v80, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v85, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v85, v4, s78 v_lshlrev_b32 v85, 0x2, v85 // Bias address scaled by BPE ds_read_b32 v82, v85 offset:0 // load Bias ds_read_b32 v83, v85 offset:1024 // load scaleAlpha v_add_lshl_u32 v84, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v84, v12, v84, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v84, v12, v84, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE ds_read_b32 v90, v93 offset:0 // load Bias ds_read_b32 v91, v93 offset:1024 // load scaleAlpha v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v95, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v95, v4, s78 v_lshlrev_b32 v95, 0x2, v95 // Bias address scaled by BPE v_add_lshl_u32 v94, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v94, v12, v94, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v94, v12, v94, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v101, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v101, v0, s78 v_lshlrev_b32 v101, 0x2, v101 // Bias address scaled by BPE v_add_lshl_u32 v100, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v100, v12, v100, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v100, v12, v100, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v107, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v107, v4, s78 v_lshlrev_b32 v107, 0x2, v107 // Bias address scaled by BPE v_add_lshl_u32 v106, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v106, v12, v106, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v106, v12, v106, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v4, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v113, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v113, v4, s78 v_lshlrev_b32 v113, 0x2, v113 // Bias address scaled by BPE v_add_lshl_u32 v112, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v112, v12, v112, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v112, v12, v112, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v0, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v119, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v119, v4, s78 v_lshlrev_b32 v119, 0x2, v119 // Bias address scaled by BPE v_add_lshl_u32 v118, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v118, v12, v118, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v118, v12, v118, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v125, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v125, v4, s78 v_lshlrev_b32 v125, 0x2, v125 // Bias address scaled by BPE v_add_lshl_u32 v124, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v124, v12, v124, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v124, v12, v124, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v4, s78 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v131, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v131, v4, s78 v_lshlrev_b32 v131, 0x2, v131 // Bias address scaled by BPE v_add_lshl_u32 v130, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v130, v12, v130, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v130, v12, v130, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v0, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v4, s78 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v140, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v140, v4, s78 v_lshlrev_b32 v140, 0x2, v140 // Bias address scaled by BPE v_add_lshl_u32 v139, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v139, v12, v139, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v139, v12, v139, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v4, s78 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v146, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v146, v4, s78 v_lshlrev_b32 v146, 0x2, v146 // Bias address scaled by BPE v_add_lshl_u32 v145, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v145, v12, v145, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v145, v12, v145, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s78 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v12, v149, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v149, v12, v149, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v152, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v152, v0, s78 v_lshlrev_b32 v152, 0x2, v152 // Bias address scaled by BPE v_add_lshl_u32 v151, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v151, v12, v151, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v151, v12, v151, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v4, s78 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v12, v155, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v155, v12, v155, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v158, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v158, v4, s78 v_lshlrev_b32 v158, 0x2, v158 // Bias address scaled by BPE v_add_lshl_u32 v157, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v157, v12, v157, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v157, v12, v157, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v4, s78 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v12, v161, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v161, v12, v161, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v164, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v164, v4, s78 v_lshlrev_b32 v164, 0x2, v164 // Bias address scaled by BPE v_add_lshl_u32 v163, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v163, v12, v163, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v163, v12, v163, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v0, s78 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v12, v167, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v167, v12, v167, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v170, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v170, v4, s78 v_lshlrev_b32 v170, 0x2, v170 // Bias address scaled by BPE v_add_lshl_u32 v169, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v169, v12, v169, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v169, v12, v169, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc180 // copy acc to vreg[45] v_accvgpr_read_b32 v[vgprValuC+18], acc184 // copy acc to vreg[46] v_accvgpr_read_b32 v[vgprValuC+19], acc188 // copy acc to vreg[47] @@ -9454,271 +9408,271 @@ v_mov_b32 v15, 0x7fff0000 // fp32 Nan v_mov_b32 v16, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v55, v4 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v56, v4 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v57, v4 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v58, v4 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v59, v4 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v60, v4 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v61, v4 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -9734,490 +9688,490 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,11,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v65, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v65, v4, s78 v_lshlrev_b32 v65, 0x2, v65 // Bias address scaled by BPE ds_read_b32 v62, v65 offset:0 // load Bias ds_read_b32 v63, v65 offset:1024 // load scaleAlpha v_add_lshl_u32 v64, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v64, v12, v64, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v64, v12, v64, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v73, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v73, v4, s78 v_lshlrev_b32 v73, 0x2, v73 // Bias address scaled by BPE ds_read_b32 v70, v73 offset:0 // load Bias ds_read_b32 v71, v73 offset:1024 // load scaleAlpha v_add_lshl_u32 v72, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v72, v12, v72, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v72, v12, v72, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v77, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v77, v4, s78 v_lshlrev_b32 v77, 0x2, v77 // Bias address scaled by BPE ds_read_b32 v74, v77 offset:0 // load Bias ds_read_b32 v75, v77 offset:1024 // load scaleAlpha v_add_lshl_u32 v76, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v76, v12, v76, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v76, v12, v76, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v4, s78 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v12, v80, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v80, v12, v80, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v85, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v85, v4, s78 v_lshlrev_b32 v85, 0x2, v85 // Bias address scaled by BPE ds_read_b32 v82, v85 offset:0 // load Bias ds_read_b32 v83, v85 offset:1024 // load scaleAlpha v_add_lshl_u32 v84, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v84, v12, v84, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v84, v12, v84, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v0, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE ds_read_b32 v90, v93 offset:0 // load Bias ds_read_b32 v91, v93 offset:1024 // load scaleAlpha v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v95, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v95, v4, s78 v_lshlrev_b32 v95, 0x2, v95 // Bias address scaled by BPE v_add_lshl_u32 v94, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v94, v12, v94, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v94, v12, v94, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v101, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v101, v4, s78 v_lshlrev_b32 v101, 0x2, v101 // Bias address scaled by BPE v_add_lshl_u32 v100, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v100, v12, v100, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v100, v12, v100, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v107, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v107, v0, s78 v_lshlrev_b32 v107, 0x2, v107 // Bias address scaled by BPE v_add_lshl_u32 v106, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v106, v12, v106, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v106, v12, v106, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v4, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v113, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v113, v4, s78 v_lshlrev_b32 v113, 0x2, v113 // Bias address scaled by BPE v_add_lshl_u32 v112, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v112, v12, v112, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v112, v12, v112, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v4, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v119, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v119, v4, s78 v_lshlrev_b32 v119, 0x2, v119 // Bias address scaled by BPE v_add_lshl_u32 v118, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v118, v12, v118, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v118, v12, v118, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v0, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v125, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v125, v4, s78 v_lshlrev_b32 v125, 0x2, v125 // Bias address scaled by BPE v_add_lshl_u32 v124, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v124, v12, v124, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v124, v12, v124, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v4, s78 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v131, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v131, v4, s78 v_lshlrev_b32 v131, 0x2, v131 // Bias address scaled by BPE v_add_lshl_u32 v130, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v130, v12, v130, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v130, v12, v130, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v4, s78 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v140, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v140, v4, s78 v_lshlrev_b32 v140, 0x2, v140 // Bias address scaled by BPE v_add_lshl_u32 v139, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v139, v12, v139, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v139, v12, v139, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v0, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v4, s78 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v146, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v146, v4, s78 v_lshlrev_b32 v146, 0x2, v146 // Bias address scaled by BPE v_add_lshl_u32 v145, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v145, v12, v145, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v145, v12, v145, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s78 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v12, v149, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v149, v12, v149, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v152, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v152, v4, s78 v_lshlrev_b32 v152, 0x2, v152 // Bias address scaled by BPE v_add_lshl_u32 v151, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v151, v12, v151, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v151, v12, v151, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v4, s78 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v12, v155, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v155, v12, v155, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v158, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v158, v0, s78 v_lshlrev_b32 v158, 0x2, v158 // Bias address scaled by BPE v_add_lshl_u32 v157, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v157, v12, v157, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v157, v12, v157, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v4, s78 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v12, v161, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v161, v12, v161, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v164, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v164, v4, s78 v_lshlrev_b32 v164, 0x2, v164 // Bias address scaled by BPE v_add_lshl_u32 v163, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v163, v12, v163, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v163, v12, v163, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v4, s78 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v12, v167, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v167, v12, v167, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v170, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v170, v4, s78 v_lshlrev_b32 v170, 0x2, v170 // Bias address scaled by BPE v_add_lshl_u32 v169, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v169, v12, v169, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v169, v12, v169, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc105 // copy acc to vreg[90] v_accvgpr_read_b32 v[vgprValuC+18], acc109 // copy acc to vreg[91] v_accvgpr_read_b32 v[vgprValuC+19], acc113 // copy acc to vreg[92] @@ -10296,271 +10250,271 @@ v_mov_b32 v15, 0x7fff0000 // fp32 Nan v_mov_b32 v16, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v55, v4 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v56, v4 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v57, v4 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v58, v4 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v59, v4 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v60, v4 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v61, v4 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -10576,494 +10530,494 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,16,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v65, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v65, v4, s78 v_lshlrev_b32 v65, 0x2, v65 // Bias address scaled by BPE ds_read_b32 v62, v65 offset:0 // load Bias ds_read_b32 v63, v65 offset:1024 // load scaleAlpha v_add_lshl_u32 v64, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v64, v12, v64, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v64, v12, v64, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v0, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v73, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v73, v4, s78 v_lshlrev_b32 v73, 0x2, v73 // Bias address scaled by BPE ds_read_b32 v70, v73 offset:0 // load Bias ds_read_b32 v71, v73 offset:1024 // load scaleAlpha v_add_lshl_u32 v72, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v72, v12, v72, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v72, v12, v72, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v77, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v77, v4, s78 v_lshlrev_b32 v77, 0x2, v77 // Bias address scaled by BPE ds_read_b32 v74, v77 offset:0 // load Bias ds_read_b32 v75, v77 offset:1024 // load scaleAlpha v_add_lshl_u32 v76, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v76, v12, v76, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v76, v12, v76, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v4, s78 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v12, v80, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v80, v12, v80, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v85, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v85, v4, s78 v_lshlrev_b32 v85, 0x2, v85 // Bias address scaled by BPE ds_read_b32 v82, v85 offset:0 // load Bias ds_read_b32 v83, v85 offset:1024 // load scaleAlpha v_add_lshl_u32 v84, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v84, v12, v84, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v84, v12, v84, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE ds_read_b32 v90, v93 offset:0 // load Bias ds_read_b32 v91, v93 offset:1024 // load scaleAlpha v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v95, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v95, v4, s78 v_lshlrev_b32 v95, 0x2, v95 // Bias address scaled by BPE v_add_lshl_u32 v94, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v94, v12, v94, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v94, v12, v94, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v0, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v101, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v101, v4, s78 v_lshlrev_b32 v101, 0x2, v101 // Bias address scaled by BPE v_add_lshl_u32 v100, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v100, v12, v100, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v100, v12, v100, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v107, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v107, v4, s78 v_lshlrev_b32 v107, 0x2, v107 // Bias address scaled by BPE v_add_lshl_u32 v106, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v106, v12, v106, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v106, v12, v106, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v4, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v113, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v113, v0, s78 v_lshlrev_b32 v113, 0x2, v113 // Bias address scaled by BPE v_add_lshl_u32 v112, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v112, v12, v112, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v112, v12, v112, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v4, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v119, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v119, v4, s78 v_lshlrev_b32 v119, 0x2, v119 // Bias address scaled by BPE v_add_lshl_u32 v118, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v118, v12, v118, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v118, v12, v118, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v125, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v125, v4, s78 v_lshlrev_b32 v125, 0x2, v125 // Bias address scaled by BPE v_add_lshl_u32 v124, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v124, v12, v124, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v124, v12, v124, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v0, s78 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v131, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v131, v4, s78 v_lshlrev_b32 v131, 0x2, v131 // Bias address scaled by BPE v_add_lshl_u32 v130, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v130, v12, v130, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v130, v12, v130, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v4, s78 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v140, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v140, v4, s78 v_lshlrev_b32 v140, 0x2, v140 // Bias address scaled by BPE v_add_lshl_u32 v139, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v139, v12, v139, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v139, v12, v139, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v4, s78 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v146, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v146, v4, s78 v_lshlrev_b32 v146, 0x2, v146 // Bias address scaled by BPE v_add_lshl_u32 v145, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v145, v12, v145, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v145, v12, v145, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v0, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s78 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v12, v149, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v149, v12, v149, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v152, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v152, v4, s78 v_lshlrev_b32 v152, 0x2, v152 // Bias address scaled by BPE v_add_lshl_u32 v151, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v151, v12, v151, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v151, v12, v151, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v4, s78 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v12, v155, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v155, v12, v155, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v158, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v158, v4, s78 v_lshlrev_b32 v158, 0x2, v158 // Bias address scaled by BPE v_add_lshl_u32 v157, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v157, v12, v157, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v157, v12, v157, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v4, s78 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v12, v161, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v161, v12, v161, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v164, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v164, v0, s78 v_lshlrev_b32 v164, 0x2, v164 // Bias address scaled by BPE v_add_lshl_u32 v163, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v163, v12, v163, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v163, v12, v163, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v4, s78 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v12, v167, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v167, v12, v167, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v170, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v170, v4, s78 v_lshlrev_b32 v170, 0x2, v170 // Bias address scaled by BPE v_add_lshl_u32 v169, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v169, v12, v169, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v169, v12, v169, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc30 // copy acc to vreg[135] v_accvgpr_read_b32 v[vgprValuC+18], acc34 // copy acc to vreg[136] v_accvgpr_read_b32 v[vgprValuC+19], acc38 // copy acc to vreg[137] @@ -11142,271 +11096,271 @@ v_mov_b32 v15, 0x7fff0000 // fp32 Nan v_mov_b32 v16, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v55, v4 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v56, v4 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v57, v4 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v58, v4 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v59, v4 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v60, v4 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v61, v4 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -11422,494 +11376,494 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,22,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v65, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v65, v4, s78 v_lshlrev_b32 v65, 0x2, v65 // Bias address scaled by BPE ds_read_b32 v62, v65 offset:0 // load Bias ds_read_b32 v63, v65 offset:1024 // load scaleAlpha v_add_lshl_u32 v64, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v64, v12, v64, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v64, v12, v64, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v73, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v73, v4, s78 v_lshlrev_b32 v73, 0x2, v73 // Bias address scaled by BPE ds_read_b32 v70, v73 offset:0 // load Bias ds_read_b32 v71, v73 offset:1024 // load scaleAlpha v_add_lshl_u32 v72, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v72, v12, v72, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v72, v12, v72, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v77, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v77, v4, s78 v_lshlrev_b32 v77, 0x2, v77 // Bias address scaled by BPE ds_read_b32 v74, v77 offset:0 // load Bias ds_read_b32 v75, v77 offset:1024 // load scaleAlpha v_add_lshl_u32 v76, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v76, v12, v76, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v76, v12, v76, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v0, s78 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE ds_read_b32 v78, v81 offset:0 // load Bias ds_read_b32 v79, v81 offset:1024 // load scaleAlpha v_add_lshl_u32 v80, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v12, v80, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v80, v12, v80, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v85, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v85, v4, s78 v_lshlrev_b32 v85, 0x2, v85 // Bias address scaled by BPE ds_read_b32 v82, v85 offset:0 // load Bias ds_read_b32 v83, v85 offset:1024 // load scaleAlpha v_add_lshl_u32 v84, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v84, v12, v84, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v84, v12, v84, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE ds_read_b32 v90, v93 offset:0 // load Bias ds_read_b32 v91, v93 offset:1024 // load scaleAlpha v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v95, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v95, v4, s78 v_lshlrev_b32 v95, 0x2, v95 // Bias address scaled by BPE v_add_lshl_u32 v94, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v94, v12, v94, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v94, v12, v94, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v101, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v101, v4, s78 v_lshlrev_b32 v101, 0x2, v101 // Bias address scaled by BPE v_add_lshl_u32 v100, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v100, v12, v100, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v100, v12, v100, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v0, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v107, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v107, v4, s78 v_lshlrev_b32 v107, 0x2, v107 // Bias address scaled by BPE v_add_lshl_u32 v106, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v106, v12, v106, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v106, v12, v106, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v4, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v113, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v113, v4, s78 v_lshlrev_b32 v113, 0x2, v113 // Bias address scaled by BPE v_add_lshl_u32 v112, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v112, v12, v112, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v112, v12, v112, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v4, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v119, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v119, v0, s78 v_lshlrev_b32 v119, 0x2, v119 // Bias address scaled by BPE v_add_lshl_u32 v118, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v118, v12, v118, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v118, v12, v118, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v125, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v125, v4, s78 v_lshlrev_b32 v125, 0x2, v125 // Bias address scaled by BPE v_add_lshl_u32 v124, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v124, v12, v124, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v124, v12, v124, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v4, s78 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v131, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v131, v4, s78 v_lshlrev_b32 v131, 0x2, v131 // Bias address scaled by BPE v_add_lshl_u32 v130, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v130, v12, v130, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v130, v12, v130, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v0, s78 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v140, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v140, v4, s78 v_lshlrev_b32 v140, 0x2, v140 // Bias address scaled by BPE v_add_lshl_u32 v139, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v139, v12, v139, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v139, v12, v139, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v4, s78 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v146, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v146, v4, s78 v_lshlrev_b32 v146, 0x2, v146 // Bias address scaled by BPE v_add_lshl_u32 v145, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v145, v12, v145, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v145, v12, v145, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v150, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v150, v4, s78 v_lshlrev_b32 v150, 0x2, v150 // Bias address scaled by BPE v_add_lshl_u32 v149, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v149, v12, v149, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v149, v12, v149, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v152, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v152, v4, s78 v_lshlrev_b32 v152, 0x2, v152 // Bias address scaled by BPE v_add_lshl_u32 v151, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v151, v12, v151, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v151, v12, v151, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v0, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v156, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v156, v4, s78 v_lshlrev_b32 v156, 0x2, v156 // Bias address scaled by BPE v_add_lshl_u32 v155, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v155, v12, v155, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v155, v12, v155, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v158, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v158, v4, s78 v_lshlrev_b32 v158, 0x2, v158 // Bias address scaled by BPE v_add_lshl_u32 v157, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v157, v12, v157, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v157, v12, v157, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v162, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v162, v4, s78 v_lshlrev_b32 v162, 0x2, v162 // Bias address scaled by BPE v_add_lshl_u32 v161, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v161, v12, v161, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v161, v12, v161, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v164, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v164, v4, s78 v_lshlrev_b32 v164, 0x2, v164 // Bias address scaled by BPE v_add_lshl_u32 v163, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v163, v12, v163, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v163, v12, v163, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v168, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v168, v4, s78 v_lshlrev_b32 v168, 0x2, v168 // Bias address scaled by BPE v_add_lshl_u32 v167, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v167, v12, v167, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v167, v12, v167, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v170, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v170, v0, s78 v_lshlrev_b32 v170, 0x2, v170 // Bias address scaled by BPE v_add_lshl_u32 v169, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v169, v12, v169, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v169, v12, v169, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc210 // copy acc to vreg[180] v_accvgpr_read_b32 v[vgprValuC+18], acc214 // copy acc to vreg[181] v_accvgpr_read_b32 v[vgprValuC+19], acc218 // copy acc to vreg[182] @@ -11988,271 +11942,271 @@ v_mov_b32 v15, 0x7fff0000 // fp32 Nan v_mov_b32 v16, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v4, v82, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v4, v86, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v55, v4 v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1 buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v4, v90, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v56, v4 v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1 buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v4, v62, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v57, v4 v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1 buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v4, v66, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v58, v4 v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1 buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v4, v70, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v59, v4 v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1 buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v4, v74, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v60, v4 v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1 buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61] // *= ScaleAlphaVecVMul v_add_f32 v4, v78, v[vgprValuC+61] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v61, v4 v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1 buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -12268,342 +12222,342 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,28,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v51, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v51, v4, s78 v_lshlrev_b32 v51, 0x2, v51 // Bias address scaled by BPE ds_read_b32 v48, v51 offset:0 // load Bias ds_read_b32 v49, v51 offset:1024 // load scaleAlpha v_add_lshl_u32 v50, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v50, v12, v50, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v50, v12, v50, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v55, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v55, v4, s78 v_lshlrev_b32 v55, 0x2, v55 // Bias address scaled by BPE ds_read_b32 v52, v55 offset:0 // load Bias ds_read_b32 v53, v55 offset:1024 // load scaleAlpha v_add_lshl_u32 v54, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v54, v12, v54, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v54, v12, v54, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v63, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v63, v4, s78 v_lshlrev_b32 v63, 0x2, v63 // Bias address scaled by BPE ds_read_b32 v60, v63 offset:0 // load Bias ds_read_b32 v61, v63 offset:1024 // load scaleAlpha v_add_lshl_u32 v62, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v62, v12, v62, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v62, v12, v62, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v67, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v67, v4, s78 v_lshlrev_b32 v67, 0x2, v67 // Bias address scaled by BPE ds_read_b32 v64, v67 offset:0 // load Bias ds_read_b32 v65, v67 offset:1024 // load scaleAlpha v_add_lshl_u32 v66, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v66, v12, v66, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v66, v12, v66, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v71, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v71, v4, s78 v_lshlrev_b32 v71, 0x2, v71 // Bias address scaled by BPE ds_read_b32 v68, v71 offset:0 // load Bias ds_read_b32 v69, v71 offset:1024 // load scaleAlpha v_add_lshl_u32 v70, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v70, v12, v70, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v70, v12, v70, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v75, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v75, v4, s78 v_lshlrev_b32 v75, 0x2, v75 // Bias address scaled by BPE ds_read_b32 v72, v75 offset:0 // load Bias ds_read_b32 v73, v75 offset:1024 // load scaleAlpha v_add_lshl_u32 v74, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v74, v12, v74, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v74, v12, v74, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v0, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v81, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v81, v4, s78 v_lshlrev_b32 v81, 0x2, v81 // Bias address scaled by BPE v_add_lshl_u32 v80, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v80, v12, v80, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v80, v12, v80, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v83, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v83, v4, s78 v_lshlrev_b32 v83, 0x2, v83 // Bias address scaled by BPE v_add_lshl_u32 v82, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v82, v12, v82, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v82, v12, v82, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v85, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v85, v4, s78 v_lshlrev_b32 v85, 0x2, v85 // Bias address scaled by BPE v_add_lshl_u32 v84, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v84, v12, v84, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v84, v12, v84, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v87, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v87, v4, s78 v_lshlrev_b32 v87, 0x2, v87 // Bias address scaled by BPE v_add_lshl_u32 v86, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v86, v12, v86, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v86, v12, v86, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v91, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v91, v4, s78 v_lshlrev_b32 v91, 0x2, v91 // Bias address scaled by BPE v_add_lshl_u32 v90, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v90, v12, v90, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v90, v12, v90, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v95, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v95, v0, s78 v_lshlrev_b32 v95, 0x2, v95 // Bias address scaled by BPE v_add_lshl_u32 v94, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v94, v12, v94, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v94, v12, v94, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v101, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v101, v4, s78 v_lshlrev_b32 v101, 0x2, v101 // Bias address scaled by BPE v_add_lshl_u32 v100, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v100, v12, v100, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v100, v12, v100, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v107, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v107, v4, s78 v_lshlrev_b32 v107, 0x2, v107 // Bias address scaled by BPE v_add_lshl_u32 v106, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v106, v12, v106, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v106, v12, v106, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v0, s86 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v0, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v113, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v113, v4, s78 v_lshlrev_b32 v113, 0x2, v113 // Bias address scaled by BPE v_add_lshl_u32 v112, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v112, v12, v112, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v112, v12, v112, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v4, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v119, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v119, v4, s78 v_lshlrev_b32 v119, 0x2, v119 // Bias address scaled by BPE v_add_lshl_u32 v118, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v118, v12, v118, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v118, v12, v118, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v125, v4, s86 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v125, v4, s78 v_lshlrev_b32 v125, 0x2, v125 // Bias address scaled by BPE v_add_lshl_u32 v124, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v124, v12, v124, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v124, v12, v124, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc135 // copy acc to vreg[225] v_accvgpr_read_b32 v[vgprValuC+18], acc139 // copy acc to vreg[226] v_accvgpr_read_b32 v[vgprValuC+19], acc143 // copy acc to vreg[227] @@ -12661,204 +12615,204 @@ v_mov_b32 v15, 0x7fff0000 // fp32 Nan v_mov_b32 v16, 0x7fff // rounding bias for bfloat16 v_mul_f32 v[vgprValuC+17], v49, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v4, v48, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v50, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v53, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v4, v52, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v57, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v4, v56, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v61, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v4, v60, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v65, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v4, v64, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v69, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v4, v68, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v73, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v4, v72, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v74, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v77, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v4, v76, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v49, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v4, v48, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v53, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v4, v52, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v57, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v4, v56, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v61, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v4, v60, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v65, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v4, v64, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v69, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v4, v68, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v73, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v4, v72, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v77, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v4, v76, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v49, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v4, v48, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v53, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v4, v52, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v57, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v4, v56, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v61, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v4, v60, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v65, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v4, v64, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v69, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v4, v68, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v73, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v4, v72, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v77, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v4, v76, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v49, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v4, v48, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v53, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v4, v52, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v57, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v4, v56, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v61, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v4, v60, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v65, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v4, v64, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v69, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v4, v68, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v73, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v4, v72, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst s_branch label_GW_End // jump to end label_GW_Beta: -s_and_b32 s74, 255, s[sgprSizeI] // s74 = s[sgprSizeI] % 256 -s_add_u32 s75, -0x1, s[sgprNumWorkGroups0] -s_cmp_ge_u32 s[sgprWorkGroup0], s75 // wg0 >= nwg0-1 ? -s_cselect_b32 s74, s74, 0 // set rMT0 -s_cmpk_gt_u32 s74, 0 // rMT0 > 0 +s_and_b32 s78, 255, s[sgprSizeI] // s78 = s[sgprSizeI] % 256 +s_add_u32 s79, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s79 // wg0 >= nwg0-1 ? +s_cselect_b32 s78, s78, 0 // set rMT0 +s_cmpk_gt_u32 s78, 0 // rMT0 > 0 s_cbranch_scc1 label_GW_B1_E1_M // jump if edges required -s_and_b32 s74, 255, s[sgprSizeJ] // s74 = s[sgprSizeJ] % 256 -s_add_u32 s75, -0x1, s[sgprNumWorkGroups1] -s_cmp_ge_u32 s[sgprWorkGroup1], s75 // wg1 >= nwg1-1 -s_cselect_b32 s74, s74, 0 // set rMT1 -s_cmpk_gt_u32 s74, 0 // rMT1 > 0 +s_and_b32 s78, 255, s[sgprSizeJ] // s78 = s[sgprSizeJ] % 256 +s_add_u32 s79, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s79 // wg1 >= nwg1-1 +s_cselect_b32 s78, s78, 0 // set rMT1 +s_cmpk_gt_u32 s78, 0 // rMT1 > 0 s_cbranch_scc1 label_GW_B1_E1_N // jump if edges required label_GW_B1_E0: s_cmpk_eq_u32 s[sgprActivationType], 3 // activationType == 3 @@ -12867,28 +12821,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_1_edge_0 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_1_edge_0 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_1_edge_0 // Branch if true label_To_Activation_None_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_To_Activation_Gelu_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_To_Activation_Relu_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_To_Activation_Silu_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_2 +label_To_Activation_Clamp_VW8_beta_1_edge_0: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s65, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_ActivationSetPCAddrEnd_2: @@ -12905,8 +12867,8 @@ label_ActivationSetPCAddrEnd_2: /* (d1,vc1,d0,vc0)=(0,0,0,0) */ v_add_lshl_u32 v18, v2, v0, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=0, coord0Vgpr=0 buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v19, v0, s74 +s_mul_i32 s68, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v19, v0, s68 v_lshlrev_b32 v19, 0x2, v19 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -12915,33 +12877,33 @@ ds_read_b128 v[84:87], v19 offset:16 // load Bias ds_read_b128 v[88:91], v19 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,1,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,2,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,3,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,4,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[108:111], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,5,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[112:115], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,6,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[116:119], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_add_lshl_u32 v17, v3, v0, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=0, coord0Vgpr=0 @@ -13062,7 +13024,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -13098,7 +13060,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -13107,8 +13069,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13137,7 +13099,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -13146,8 +13108,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13176,7 +13138,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -13185,8 +13147,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13215,7 +13177,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -13224,8 +13186,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13254,7 +13216,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -13263,8 +13225,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13293,7 +13255,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -13302,8 +13264,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -13316,8 +13278,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,7,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[80:83], v19 offset:0 // load Bias @@ -13325,33 +13287,33 @@ ds_read_b128 v[84:87], v19 offset:16 // load Bias ds_read_b128 v[88:91], v19 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,8,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,9,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,10,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,11,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[108:111], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,12,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[112:115], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,13,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[116:119], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc224 // copy acc to vreg[56] @@ -13471,7 +13433,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -13480,8 +13442,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13510,7 +13472,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -13519,8 +13481,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13549,7 +13511,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -13558,8 +13520,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13588,7 +13550,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -13597,8 +13559,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13627,7 +13589,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -13636,8 +13598,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13666,7 +13628,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -13675,8 +13637,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13705,7 +13667,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -13714,8 +13676,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -13728,8 +13690,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,14,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[80:83], v19 offset:0 // load Bias @@ -13737,33 +13699,33 @@ ds_read_b128 v[84:87], v19 offset:16 // load Bias ds_read_b128 v[88:91], v19 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,15,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,16,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,17,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,18,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[108:111], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,19,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[112:115], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,20,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[116:119], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc193 // copy acc to vreg[112] @@ -13883,7 +13845,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -13892,8 +13854,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13922,7 +13884,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -13931,8 +13893,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13961,7 +13923,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -13970,8 +13932,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14000,7 +13962,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -14009,8 +13971,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14039,7 +14001,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -14048,8 +14010,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14078,7 +14040,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -14087,8 +14049,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14117,7 +14079,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -14126,8 +14088,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -14140,8 +14102,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,21,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[80:83], v19 offset:0 // load Bias @@ -14149,33 +14111,33 @@ ds_read_b128 v[84:87], v19 offset:16 // load Bias ds_read_b128 v[88:91], v19 offset:1024 // load scaleAlpha ds_read_b128 v[92:95], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,22,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,23,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,24,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,25,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[108:111], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,26,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[112:115], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,27,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[116:119], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc162 // copy acc to vreg[168] @@ -14295,7 +14257,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -14304,8 +14266,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14334,7 +14296,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -14343,8 +14305,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14373,7 +14335,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -14382,8 +14344,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14412,7 +14374,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -14421,8 +14383,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14451,7 +14413,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -14460,8 +14422,8 @@ v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14490,7 +14452,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -14499,8 +14461,8 @@ v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14529,7 +14491,7 @@ v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[72:73], v[4:5] v_mov_b64 v[74:75], v[6:7] v_mov_b64 v[76:77], v[8:9] @@ -14538,8 +14500,8 @@ v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -14552,8 +14514,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,28,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[56:59], v19 offset:0 // load Bias @@ -14561,18 +14523,18 @@ ds_read_b128 v[60:63], v19 offset:16 // load Bias ds_read_b128 v[64:67], v19 offset:1024 // load scaleAlpha ds_read_b128 v[68:71], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,29,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[72:75], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,30,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[76:79], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,31,0,0) */ -s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[80:83], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc131 // copy acc to vreg[224] @@ -14656,7 +14618,7 @@ v_pk_add_f32 v[4:5], v[56:57], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[58:59], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[60:61], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[62:63], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -14665,8 +14627,8 @@ v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14695,7 +14657,7 @@ v_pk_add_f32 v[4:5], v[56:57], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[58:59], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[60:61], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[62:63], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -14704,8 +14666,8 @@ v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14734,7 +14696,7 @@ v_pk_add_f32 v[4:5], v[56:57], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[58:59], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[60:61], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[62:63], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -14743,8 +14705,8 @@ v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14773,7 +14735,7 @@ v_pk_add_f32 v[4:5], v[56:57], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[58:59], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[60:61], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[62:63], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -14782,8 +14744,8 @@ v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor -s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s68, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -14795,28 +14757,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_1_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_1_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_1_edge_1 // Branch if true label_To_Activation_None_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_To_Activation_Gelu_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_To_Activation_Relu_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_To_Activation_Silu_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_1 +label_To_Activation_Clamp_VW8_beta_1_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s65, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_ActivationSetPCAddrEnd_1: @@ -14832,14 +14802,14 @@ label_ActivationSetPCAddrEnd_1: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v17, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -14848,92 +14818,92 @@ ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v19, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v0, s78 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v0, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v95, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v107, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v0, s78 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] @@ -15038,7 +15008,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -15072,7 +15042,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -15106,7 +15076,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -15140,7 +15110,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -15174,7 +15144,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -15208,7 +15178,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -15234,106 +15204,106 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v17, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[72:75], v18 offset:0 // load Bias ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v19, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v0, s78 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v0, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v95, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v107, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v0, s78 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc192 // copy acc to vreg[48] v_accvgpr_read_b32 v[vgprValuC+25], acc196 // copy acc to vreg[49] v_accvgpr_read_b32 v[vgprValuC+26], acc200 // copy acc to vreg[50] @@ -15438,7 +15408,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -15472,7 +15442,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -15506,7 +15476,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -15540,7 +15510,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -15574,7 +15544,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -15608,7 +15578,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -15634,106 +15604,106 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v17, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[72:75], v18 offset:0 // load Bias ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v19, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v0, s78 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v0, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v95, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v107, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v0, s78 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc129 // copy acc to vreg[96] v_accvgpr_read_b32 v[vgprValuC+25], acc133 // copy acc to vreg[97] v_accvgpr_read_b32 v[vgprValuC+26], acc137 // copy acc to vreg[98] @@ -15838,7 +15808,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -15872,7 +15842,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -15906,7 +15876,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -15940,7 +15910,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -15974,7 +15944,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -16008,7 +15978,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -16034,106 +16004,106 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v17, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[72:75], v18 offset:0 // load Bias ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v19, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v0, s78 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v0, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v95, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v107, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v0, s78 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc66 // copy acc to vreg[144] v_accvgpr_read_b32 v[vgprValuC+25], acc70 // copy acc to vreg[145] v_accvgpr_read_b32 v[vgprValuC+26], acc74 // copy acc to vreg[146] @@ -16238,7 +16208,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -16272,7 +16242,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -16306,7 +16276,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -16340,7 +16310,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -16374,7 +16344,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -16408,7 +16378,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -16434,106 +16404,106 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v17, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[72:75], v18 offset:0 // load Bias ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v19, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v0, s78 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v0, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v95, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v0, s78 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v0, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v107, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v0, s78 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc3 // copy acc to vreg[192] v_accvgpr_read_b32 v[vgprValuC+25], acc7 // copy acc to vreg[193] v_accvgpr_read_b32 v[vgprValuC+26], acc11 // copy acc to vreg[194] @@ -16638,7 +16608,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -16672,7 +16642,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -16706,7 +16676,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[40:41], v[4:5] v_mov_b64 v[42:43], v[6:7] v_mov_b64 v[44:45], v[8:9] @@ -16740,7 +16710,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[48:49], v[4:5] v_mov_b64 v[50:51], v[6:7] v_mov_b64 v[52:53], v[8:9] @@ -16774,7 +16744,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[56:57], v[4:5] v_mov_b64 v[58:59], v[6:7] v_mov_b64 v[60:61], v[8:9] @@ -16808,7 +16778,7 @@ v_pk_add_f32 v[4:5], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[6:7], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[8:9], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[10:11], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[64:65], v[4:5] v_mov_b64 v[66:67], v[6:7] v_mov_b64 v[68:69], v[8:9] @@ -16834,38 +16804,38 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v17, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v0, s78 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[40:43], v18 offset:0 // load Bias ds_read_b128 v[44:47], v18 offset:16 // load Bias ds_read_b128 v[48:51], v18 offset:1024 // load scaleAlpha ds_read_b128 v[52:55], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v12, v17, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v17, v12, v17, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v19, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDC clip if OOB. offset buffer_load_dwordx4 v[56:59], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v60, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v60, v0, s78 v_lshlrev_b32 v60, 0x2, v60 // Bias address scaled by BPE v_add_lshl_u32 v19, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v12, v19, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v19, v12, v19, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc195 // copy acc to vreg[240] v_accvgpr_read_b32 v[vgprValuC+25], acc199 // copy acc to vreg[241] v_accvgpr_read_b32 v[vgprValuC+26], acc203 // copy acc to vreg[242] @@ -16922,7 +16892,7 @@ v_pk_add_f32 v[4:5], v[40:41], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[6:7], v[42:43], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[8:9], v[44:45], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[10:11], v[46:47], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[24:25], v[4:5] v_mov_b64 v[26:27], v[6:7] v_mov_b64 v[28:29], v[8:9] @@ -16956,7 +16926,7 @@ v_pk_add_f32 v[4:5], v[40:41], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[6:7], v[42:43], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[8:9], v[44:45], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[10:11], v[46:47], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b64 v[32:33], v[4:5] v_mov_b64 v[34:35], v[6:7] v_mov_b64 v[36:37], v[8:9] @@ -16975,28 +16945,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW1_beta_1_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW1_beta_1_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW1_beta_1_edge_1 // Branch if true label_To_Activation_None_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_None_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_None_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_To_Activation_Gelu_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Gelu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Gelu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_To_Activation_Relu_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Relu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Relu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_To_Activation_Silu_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s74, label_Activation_Silu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s74 // add target branch offset +s_add_i32 s65, label_Activation_Silu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd +label_To_Activation_Clamp_VW1_beta_1_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s65, label_Activation_Clamp_VW1, 4 // target branch offset +s_add_u32 s8, s8, s65 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_ActivationSetPCAddrEnd: @@ -17012,532 +16990,532 @@ label_ActivationSetPCAddrEnd: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v0, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v4, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v88, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v96, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v0, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v99, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v4, s78 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v102, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v108, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v111, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v114, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v117, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s78 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v120, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v0, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v123, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v4, s78 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v126, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v129, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s78 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v135, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v138, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v4, s78 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v141, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v144, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s78 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v147, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v0, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v150, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v4, s78 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v153, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v156, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s78 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v159, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v162, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v4, s78 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v165, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v168, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v4, s78 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v171, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v0, s78 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v174, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v4, s78 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v177, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v4, s78 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v180, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v4, s78 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v183, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v4, s78 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v186, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v4, s78 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+18], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+19], acc8 // copy acc to vreg[2] @@ -17608,7 +17586,7 @@ v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17616,7 +17594,7 @@ v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17624,7 +17602,7 @@ v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17632,7 +17610,7 @@ v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17640,7 +17618,7 @@ v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17648,7 +17626,7 @@ v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17656,7 +17634,7 @@ v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17664,7 +17642,7 @@ v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v90 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17672,7 +17650,7 @@ v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v95 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17680,7 +17658,7 @@ v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17688,7 +17666,7 @@ v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17696,7 +17674,7 @@ v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17704,7 +17682,7 @@ v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17712,7 +17690,7 @@ v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17720,7 +17698,7 @@ v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17728,7 +17706,7 @@ v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v116 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17736,7 +17714,7 @@ v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v119 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17744,7 +17722,7 @@ v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v122 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17752,7 +17730,7 @@ v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v125 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17760,7 +17738,7 @@ v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v128 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17768,7 +17746,7 @@ v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v131 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17776,7 +17754,7 @@ v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v137 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17784,7 +17762,7 @@ v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v140 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17792,7 +17770,7 @@ v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v143 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17800,7 +17778,7 @@ v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v146 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17808,7 +17786,7 @@ v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v149 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17816,7 +17794,7 @@ v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v152 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17824,7 +17802,7 @@ v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v155 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17832,7 +17810,7 @@ v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v158 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17840,7 +17818,7 @@ v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v161 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17848,7 +17826,7 @@ v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v164 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17856,7 +17834,7 @@ v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v167 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17864,7 +17842,7 @@ v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v170 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17872,7 +17850,7 @@ v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v173 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17880,7 +17858,7 @@ v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v176 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17888,7 +17866,7 @@ v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v179 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17896,7 +17874,7 @@ v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v182 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17904,7 +17882,7 @@ v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v185 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -17920,534 +17898,534 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,4,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v0, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v4, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v88, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v96, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v99, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v4, s78 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v102, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v0, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v108, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v111, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v114, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v117, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s78 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v120, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v123, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v4, s78 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v126, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v0, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v129, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s78 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v135, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v138, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v4, s78 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v141, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v144, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s78 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v147, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v150, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v4, s78 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v153, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v0, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v156, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s78 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v159, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v162, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v4, s78 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v165, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v168, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v4, s78 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v171, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v4, s78 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v174, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v4, s78 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v177, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v0, s78 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v180, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v4, s78 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v183, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v4, s78 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v186, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v4, s78 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc152 // copy acc to vreg[38] v_accvgpr_read_b32 v[vgprValuC+18], acc156 // copy acc to vreg[39] v_accvgpr_read_b32 v[vgprValuC+19], acc160 // copy acc to vreg[40] @@ -18518,7 +18496,7 @@ v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18526,7 +18504,7 @@ v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18534,7 +18512,7 @@ v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18542,7 +18520,7 @@ v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18550,7 +18528,7 @@ v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18558,7 +18536,7 @@ v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18566,7 +18544,7 @@ v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18574,7 +18552,7 @@ v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v90 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18582,7 +18560,7 @@ v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v95 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18590,7 +18568,7 @@ v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18598,7 +18576,7 @@ v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18606,7 +18584,7 @@ v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18614,7 +18592,7 @@ v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18622,7 +18600,7 @@ v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18630,7 +18608,7 @@ v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18638,7 +18616,7 @@ v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v116 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18646,7 +18624,7 @@ v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v119 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18654,7 +18632,7 @@ v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v122 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18662,7 +18640,7 @@ v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v125 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18670,7 +18648,7 @@ v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v128 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18678,7 +18656,7 @@ v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v131 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18686,7 +18664,7 @@ v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v137 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18694,7 +18672,7 @@ v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v140 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18702,7 +18680,7 @@ v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v143 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18710,7 +18688,7 @@ v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v146 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18718,7 +18696,7 @@ v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v149 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18726,7 +18704,7 @@ v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v152 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18734,7 +18712,7 @@ v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v155 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18742,7 +18720,7 @@ v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v158 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18750,7 +18728,7 @@ v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v161 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18758,7 +18736,7 @@ v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v164 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18766,7 +18744,7 @@ v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v167 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18774,7 +18752,7 @@ v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v170 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18782,7 +18760,7 @@ v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v173 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18790,7 +18768,7 @@ v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v176 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18798,7 +18776,7 @@ v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v179 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18806,7 +18784,7 @@ v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v182 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18814,7 +18792,7 @@ v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v185 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18830,534 +18808,534 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,9,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v0, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v88, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v96, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v99, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v4, s78 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v102, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v108, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v0, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v111, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v114, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v117, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s78 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v120, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v123, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v4, s78 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v126, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v129, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s78 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v135, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v0, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v138, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v4, s78 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v141, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v144, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s78 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v147, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v150, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v4, s78 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v153, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v156, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s78 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v159, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v0, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v162, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v4, s78 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v165, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v168, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v4, s78 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v171, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v4, s78 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v174, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v4, s78 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v177, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v4, s78 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v180, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v4, s78 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v183, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v0, s78 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v186, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v4, s78 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc49 // copy acc to vreg[76] v_accvgpr_read_b32 v[vgprValuC+18], acc53 // copy acc to vreg[77] v_accvgpr_read_b32 v[vgprValuC+19], acc57 // copy acc to vreg[78] @@ -19428,7 +19406,7 @@ v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19436,7 +19414,7 @@ v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19444,7 +19422,7 @@ v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19452,7 +19430,7 @@ v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19460,7 +19438,7 @@ v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19468,7 +19446,7 @@ v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19476,7 +19454,7 @@ v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19484,7 +19462,7 @@ v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v90 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19492,7 +19470,7 @@ v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v95 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19500,7 +19478,7 @@ v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19508,7 +19486,7 @@ v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19516,7 +19494,7 @@ v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19524,7 +19502,7 @@ v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19532,7 +19510,7 @@ v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19540,7 +19518,7 @@ v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19548,7 +19526,7 @@ v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v116 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19556,7 +19534,7 @@ v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v119 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19564,7 +19542,7 @@ v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v122 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19572,7 +19550,7 @@ v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v125 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19580,7 +19558,7 @@ v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v128 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19588,7 +19566,7 @@ v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v131 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19596,7 +19574,7 @@ v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v137 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19604,7 +19582,7 @@ v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v140 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19612,7 +19590,7 @@ v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v143 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19620,7 +19598,7 @@ v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v146 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19628,7 +19606,7 @@ v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v149 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19636,7 +19614,7 @@ v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v152 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19644,7 +19622,7 @@ v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v155 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19652,7 +19630,7 @@ v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v158 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19660,7 +19638,7 @@ v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v161 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19668,7 +19646,7 @@ v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v164 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19676,7 +19654,7 @@ v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v167 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19684,7 +19662,7 @@ v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v170 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19692,7 +19670,7 @@ v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v173 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19700,7 +19678,7 @@ v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v176 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19708,7 +19686,7 @@ v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v179 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19716,7 +19694,7 @@ v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v182 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19724,7 +19702,7 @@ v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v185 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19740,530 +19718,530 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,14,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v4, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v88, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v0, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v96, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v99, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v4, s78 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v102, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v108, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v111, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v114, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v0, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v117, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s78 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v120, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v123, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v4, s78 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v126, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v129, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s78 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v135, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v138, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v4, s78 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v141, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v0, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v144, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s78 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v147, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v150, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v4, s78 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v153, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v156, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s78 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v159, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v162, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v4, s78 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v165, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v0, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v168, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v4, s78 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v171, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v4, s78 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v174, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v4, s78 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v177, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v4, s78 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v180, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v4, s78 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v183, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v4, s78 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v186, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v4, s78 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc201 // copy acc to vreg[114] v_accvgpr_read_b32 v[vgprValuC+18], acc205 // copy acc to vreg[115] v_accvgpr_read_b32 v[vgprValuC+19], acc209 // copy acc to vreg[116] @@ -20334,7 +20312,7 @@ v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20342,7 +20320,7 @@ v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20350,7 +20328,7 @@ v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20358,7 +20336,7 @@ v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20366,7 +20344,7 @@ v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20374,7 +20352,7 @@ v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20382,7 +20360,7 @@ v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20390,7 +20368,7 @@ v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v90 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20398,7 +20376,7 @@ v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v95 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20406,7 +20384,7 @@ v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20414,7 +20392,7 @@ v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20422,7 +20400,7 @@ v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20430,7 +20408,7 @@ v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20438,7 +20416,7 @@ v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20446,7 +20424,7 @@ v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20454,7 +20432,7 @@ v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v116 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20462,7 +20440,7 @@ v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v119 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20470,7 +20448,7 @@ v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v122 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20478,7 +20456,7 @@ v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v125 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20486,7 +20464,7 @@ v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v128 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20494,7 +20472,7 @@ v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v131 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20502,7 +20480,7 @@ v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v137 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20510,7 +20488,7 @@ v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v140 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20518,7 +20496,7 @@ v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v143 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20526,7 +20504,7 @@ v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v146 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20534,7 +20512,7 @@ v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v149 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20542,7 +20520,7 @@ v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v152 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20550,7 +20528,7 @@ v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v155 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20558,7 +20536,7 @@ v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v158 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20566,7 +20544,7 @@ v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v161 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20574,7 +20552,7 @@ v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v164 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20582,7 +20560,7 @@ v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v167 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20590,7 +20568,7 @@ v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v170 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20598,7 +20576,7 @@ v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v173 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20606,7 +20584,7 @@ v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v176 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20614,7 +20592,7 @@ v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v179 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20622,7 +20600,7 @@ v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v182 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20630,7 +20608,7 @@ v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v185 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20650,530 +20628,530 @@ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v0, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v4, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v88, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v96, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v0, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v99, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v4, s78 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v102, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v108, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v111, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v114, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v117, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s78 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v120, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v0, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v123, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v4, s78 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v126, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v129, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s78 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v135, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v138, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v4, s78 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v141, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v144, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s78 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v147, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v0, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v150, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v4, s78 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v153, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v156, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s78 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v159, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v162, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v4, s78 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v165, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v168, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v4, s78 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v171, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v0, s78 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v174, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v4, s78 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v177, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v4, s78 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v180, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v4, s78 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v183, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v4, s78 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v186, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v4, s78 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc98 // copy acc to vreg[152] v_accvgpr_read_b32 v[vgprValuC+18], acc102 // copy acc to vreg[153] v_accvgpr_read_b32 v[vgprValuC+19], acc106 // copy acc to vreg[154] @@ -21244,7 +21222,7 @@ v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21252,7 +21230,7 @@ v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21260,7 +21238,7 @@ v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21268,7 +21246,7 @@ v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21276,7 +21254,7 @@ v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21284,7 +21262,7 @@ v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21292,7 +21270,7 @@ v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21300,7 +21278,7 @@ v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v90 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21308,7 +21286,7 @@ v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v95 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21316,7 +21294,7 @@ v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21324,7 +21302,7 @@ v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21332,7 +21310,7 @@ v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21340,7 +21318,7 @@ v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21348,7 +21326,7 @@ v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21356,7 +21334,7 @@ v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21364,7 +21342,7 @@ v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v116 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21372,7 +21350,7 @@ v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v119 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21380,7 +21358,7 @@ v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v122 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21388,7 +21366,7 @@ v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v125 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21396,7 +21374,7 @@ v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v128 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21404,7 +21382,7 @@ v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v131 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21412,7 +21390,7 @@ v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v137 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21420,7 +21398,7 @@ v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v140 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21428,7 +21406,7 @@ v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v143 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21436,7 +21414,7 @@ v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v146 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21444,7 +21422,7 @@ v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v149 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21452,7 +21430,7 @@ v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v152 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21460,7 +21438,7 @@ v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v155 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21468,7 +21446,7 @@ v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v158 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21476,7 +21454,7 @@ v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v161 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21484,7 +21462,7 @@ v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v164 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21492,7 +21470,7 @@ v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v167 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21500,7 +21478,7 @@ v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v170 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21508,7 +21486,7 @@ v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v173 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21516,7 +21494,7 @@ v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v176 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21524,7 +21502,7 @@ v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v179 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21532,7 +21510,7 @@ v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v182 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21540,7 +21518,7 @@ v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v185 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21556,534 +21534,534 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,23,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v0, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v4, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v88, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s78 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v12, v88, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v88, v12, v88, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v93, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s78 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v12, v93, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v93, v12, v93, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v96, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s78 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v12, v96, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v96, v12, v96, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v99, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v4, s78 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v12, v99, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v99, v12, v99, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v102, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v0, s78 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v12, v102, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v102, v12, v102, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v105, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s78 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v12, v105, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v105, v12, v105, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v108, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s78 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v12, v108, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v108, v12, v108, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v111, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s78 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v12, v111, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v111, v12, v111, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v114, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s78 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v12, v114, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v114, v12, v114, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v117, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s78 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v12, v117, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v117, v12, v117, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v120, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s78 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v12, v120, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v120, v12, v120, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v123, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v4, s78 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v12, v123, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v123, v12, v123, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v126, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v0, s78 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v12, v126, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v126, v12, v126, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v129, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v4, s78 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v12, v129, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v129, v12, v129, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v135, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s78 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v12, v135, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v135, v12, v135, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v138, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v4, s78 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v12, v138, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v138, v12, v138, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v141, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s78 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v12, v141, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v141, v12, v141, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v144, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s78 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v12, v144, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v144, v12, v144, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v147, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s78 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v12, v147, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v147, v12, v147, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v150, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v4, s78 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v12, v150, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v150, v12, v150, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v153, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v0, s78 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v12, v153, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v153, v12, v153, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v156, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v4, s78 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v12, v156, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v156, v12, v156, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v159, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s78 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v12, v159, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v159, v12, v159, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v162, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v4, s78 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v12, v162, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v162, v12, v162, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v165, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s78 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v12, v165, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v165, v12, v165, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v168, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v4, s78 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v12, v168, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v168, v12, v168, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v171, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v4, s78 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v12, v171, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v171, v12, v171, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v174, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v4, s78 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v12, v174, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v174, v12, v174, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v177, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v0, s78 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v12, v177, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v177, v12, v177, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v180, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v4, s78 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v12, v180, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v180, v12, v180, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v183, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v4, s78 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v12, v183, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v183, v12, v183, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v186, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v4, s78 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v12, v186, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v186, v12, v186, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc250 // copy acc to vreg[190] v_accvgpr_read_b32 v[vgprValuC+18], acc254 // copy acc to vreg[191] v_accvgpr_read_b32 v[vgprValuC+19], acc3 // copy acc to vreg[192] @@ -22154,7 +22132,7 @@ v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22162,7 +22140,7 @@ v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22170,7 +22148,7 @@ v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22178,7 +22156,7 @@ v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22186,7 +22164,7 @@ v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22194,7 +22172,7 @@ v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22202,7 +22180,7 @@ v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22210,7 +22188,7 @@ v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v90 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22218,7 +22196,7 @@ v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v95 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22226,7 +22204,7 @@ v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22234,7 +22212,7 @@ v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22242,7 +22220,7 @@ v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22250,7 +22228,7 @@ v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22258,7 +22236,7 @@ v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22266,7 +22244,7 @@ v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22274,7 +22252,7 @@ v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v116 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22282,7 +22260,7 @@ v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v119 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22290,7 +22268,7 @@ v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v122 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22298,7 +22276,7 @@ v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v125 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22306,7 +22284,7 @@ v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v128 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22314,7 +22292,7 @@ v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v131 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22322,7 +22300,7 @@ v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v137 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22330,7 +22308,7 @@ v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v140 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22338,7 +22316,7 @@ v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v143 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22346,7 +22324,7 @@ v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v146 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22354,7 +22332,7 @@ v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v149 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22362,7 +22340,7 @@ v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v152 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22370,7 +22348,7 @@ v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v155 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22378,7 +22356,7 @@ v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v158 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v45, v4 v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22386,7 +22364,7 @@ v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v161 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v46, v4 v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22394,7 +22372,7 @@ v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v164 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v47, v4 v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22402,7 +22380,7 @@ v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v167 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v48, v4 v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22410,7 +22388,7 @@ v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v170 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v49, v4 v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22418,7 +22396,7 @@ v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v173 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v50, v4 v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22426,7 +22404,7 @@ v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v176 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v51, v4 v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22434,7 +22412,7 @@ v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v179 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v52, v4 v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22442,7 +22420,7 @@ v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v182 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v53, v4 v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22450,7 +22428,7 @@ v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v185 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v54, v4 v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22466,396 +22444,396 @@ s_nop 0 // 1 wait state required when v_mov_b32 v12, BufferOOB /* (d1,vc1,d0,vc0)=(0,28,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v48, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v48, v12, v48, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v48, v12, v48, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v45, v48, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v49, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v49, v4, s78 v_lshlrev_b32 v49, 0x2, v49 // Bias address scaled by BPE ds_read_b32 v46, v49 offset:0 // load Bias ds_read_b32 v47, v49 offset:1024 // load scaleAlpha v_add_lshl_u32 v48, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v48, v12, v48, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v48, v12, v48, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v53, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v53, v12, v53, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v53, v12, v53, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v50, v53, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v54, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v54, v4, s78 v_lshlrev_b32 v54, 0x2, v54 // Bias address scaled by BPE ds_read_b32 v51, v54 offset:0 // load Bias ds_read_b32 v52, v54 offset:1024 // load scaleAlpha v_add_lshl_u32 v53, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v53, v12, v53, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v53, v12, v53, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v58, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s78 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v12, v58, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v58, v12, v58, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v63, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s78 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v12, v63, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v63, v12, v63, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v68, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v0, s78 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v12, v68, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v68, v12, v68, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v73, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v4, s78 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v12, v73, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v73, v12, v73, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v78, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v4, s78 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v12, v78, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v78, v12, v78, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v83, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v4, s78 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v12, v83, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v83, v12, v83, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v86, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v86, v12, v86, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v86, v12, v86, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v85, v86, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v87, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v87, v4, s78 v_lshlrev_b32 v87, 0x2, v87 // Bias address scaled by BPE v_add_lshl_u32 v86, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v86, v12, v86, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v86, v12, v86, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v89, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v89, v12, v89, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v89, v12, v89, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v88, v89, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v90, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v90, v4, s78 v_lshlrev_b32 v90, 0x2, v90 // Bias address scaled by BPE v_add_lshl_u32 v89, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v89, v12, v89, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v89, v12, v89, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v92, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v4, s78 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE v_add_lshl_u32 v92, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v12, v92, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v92, v12, v92, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v95, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v94, v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v4, s78 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v95, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v12, v95, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v95, v12, v95, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v98, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v0, s78 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v12, v98, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v98, v12, v98, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v101, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v12, v101, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v101, v12, v101, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v100, v101, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v4, s78 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v12, v101, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v101, v12, v101, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v104, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v4, s78 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v12, v104, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v104, v12, v104, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v107, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v106, v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s78 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v12, v107, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v107, v12, v107, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v110, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v4, s78 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v12, v110, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v110, v12, v110, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v113, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v12, v113, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v113, v12, v113, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v112, v113, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v4, s78 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v12, v113, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v113, v12, v113, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v116, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v4, s78 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v12, v116, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v116, v12, v116, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v119, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v12, v119, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v119, v12, v119, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v118, v119, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v4, s78 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v12, v119, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v119, v12, v119, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v1, vcc, v1, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v2, v2, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v3, v3, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[86:87], v0, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v122, v2, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v0, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v0, s78 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v3, v0, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v12, v122, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v122, v12, v122, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,1) */ v_add_co_u32 v4, vcc, v0, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v125, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v12, v125, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v125, v12, v125, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v124, v125, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v4, s78 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v12, v125, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v125, v12, v125, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,2) */ v_add_co_u32 v4, vcc, v0, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v128, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v4, s78 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v12, v128, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v128, v12, v128, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,3) */ v_add_co_u32 v4, vcc, v0, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v131, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v12, v131, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v131, v12, v131, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v130, v131, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v4, s78 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v12, v131, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v131, v12, v131, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,4) */ v_add_co_u32 v4, vcc, v0, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v137, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v4, s78 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v12, v137, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v137, v12, v137, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,5) */ v_add_co_u32 v4, vcc, v0, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v140, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v12, v140, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v140, v12, v140, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v139, v140, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v4, s78 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v12, v140, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v140, v12, v140, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,6) */ v_add_co_u32 v4, vcc, v0, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v143, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v4, s78 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v12, v143, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v143, v12, v143, s[82:83] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,7) */ v_add_co_u32 v4, vcc, v0, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[86:87], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[90:91], v1, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[90:91], s[86:87], s[90:91] // in0 && in1 +v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[82:83], s[78:79], s[82:83] // in0 && in1 v_add_lshl_u32 v146, v2, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v12, v146, s[90:91] // LDC clip if OOB. offset +v_cndmask_b32 v146, v12, v146, s[82:83] // LDC clip if OOB. offset buffer_load_short_d16 v145, v146, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s86, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v4, s86 +s_mul_i32 s78, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v4, s78 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v3, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v12, v146, s[90:91] // LDD clip if OOB. offset +v_cndmask_b32 v146, v12, v146, s[82:83] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc147 // copy acc to vreg[228] v_accvgpr_read_b32 v[vgprValuC+18], acc151 // copy acc to vreg[229] v_accvgpr_read_b32 v[vgprValuC+19], acc155 // copy acc to vreg[230] @@ -22911,7 +22889,7 @@ v_mul_f32 v[vgprValuC+17], v47, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v45 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+17], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v46, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v17, v4 v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1 buffer_store_short v17, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22919,7 +22897,7 @@ v_mul_f32 v[vgprValuC+18], v52, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v50 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+18], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v51, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v18, v4 v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1 buffer_store_short v18, v53, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22927,7 +22905,7 @@ v_mul_f32 v[vgprValuC+19], v57, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v55 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+19], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v19, v4 v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1 buffer_store_short v19, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22935,7 +22913,7 @@ v_mul_f32 v[vgprValuC+20], v62, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v60 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+20], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v20, v4 v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1 buffer_store_short v20, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22943,7 +22921,7 @@ v_mul_f32 v[vgprValuC+21], v67, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v65 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+21], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v21, v4 v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1 buffer_store_short v21, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22951,7 +22929,7 @@ v_mul_f32 v[vgprValuC+22], v72, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v70 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+22], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v22, v4 v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1 buffer_store_short v22, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22959,7 +22937,7 @@ v_mul_f32 v[vgprValuC+23], v77, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v75 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+23], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v23, v4 v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1 buffer_store_short v23, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22967,7 +22945,7 @@ v_mul_f32 v[vgprValuC+24], v82, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v80 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v24, v4 v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1 buffer_store_short v24, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22975,7 +22953,7 @@ v_mul_f32 v[vgprValuC+25], v47, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v85 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v46, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v25, v4 v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1 buffer_store_short v25, v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22983,7 +22961,7 @@ v_mul_f32 v[vgprValuC+26], v52, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v88 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v51, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v26, v4 v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1 buffer_store_short v26, v89, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22991,7 +22969,7 @@ v_mul_f32 v[vgprValuC+27], v57, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v91 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v27, v4 v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1 buffer_store_short v27, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22999,7 +22977,7 @@ v_mul_f32 v[vgprValuC+28], v62, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v94 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v28, v4 v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23007,7 +22985,7 @@ v_mul_f32 v[vgprValuC+29], v67, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v97 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v29, v4 v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1 buffer_store_short v29, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23015,7 +22993,7 @@ v_mul_f32 v[vgprValuC+30], v72, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v100 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v30, v4 v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1 buffer_store_short v30, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23023,7 +23001,7 @@ v_mul_f32 v[vgprValuC+31], v77, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v103 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v31, v4 v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1 buffer_store_short v31, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23031,7 +23009,7 @@ v_mul_f32 v[vgprValuC+32], v82, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v106 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v32, v4 v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23039,7 +23017,7 @@ v_mul_f32 v[vgprValuC+33], v47, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v109 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v46, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v33, v4 v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23047,7 +23025,7 @@ v_mul_f32 v[vgprValuC+34], v52, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v112 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v51, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v34, v4 v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1 buffer_store_short v34, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23055,7 +23033,7 @@ v_mul_f32 v[vgprValuC+35], v57, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v115 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v35, v4 v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1 buffer_store_short v35, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23063,7 +23041,7 @@ v_mul_f32 v[vgprValuC+36], v62, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v118 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v36, v4 v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1 buffer_store_short v36, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23071,7 +23049,7 @@ v_mul_f32 v[vgprValuC+37], v67, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v121 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v66, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v37, v4 v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1 buffer_store_short v37, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23079,7 +23057,7 @@ v_mul_f32 v[vgprValuC+38], v72, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v124 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v71, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v38, v4 v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1 buffer_store_short v38, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23087,7 +23065,7 @@ v_mul_f32 v[vgprValuC+39], v77, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v127 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v76, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v39, v4 v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1 buffer_store_short v39, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23095,7 +23073,7 @@ v_mul_f32 v[vgprValuC+40], v82, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v130 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v81, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v40, v4 v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1 buffer_store_short v40, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23103,7 +23081,7 @@ v_mul_f32 v[vgprValuC+41], v47, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v136 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v46, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v41, v4 v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1 buffer_store_short v41, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23111,7 +23089,7 @@ v_mul_f32 v[vgprValuC+42], v52, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v139 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v51, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v42, v4 v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1 buffer_store_short v42, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23119,7 +23097,7 @@ v_mul_f32 v[vgprValuC+43], v57, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v142 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v56, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v43, v4 v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1 buffer_store_short v43, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23127,14 +23105,14 @@ v_mul_f32 v[vgprValuC+44], v62, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_cvt_f32_bf16 v4, v145 src0_sel:WORD_0 // cvt bf16 to f32 v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta] // finalSum = sum*alpha + C*beta v_add_f32 v4, v61, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[66:67], s[8:9] v_mov_b32 v44, v4 v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1 buffer_store_short v44, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst s_branch label_GW_End // jump to end label_Activation_None_VW8: -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] label_Activation_Gelu_VW8: v_mul_f32 v12, 0x3d372713, v4 // k1 * x v_fma_f32 v12, v4, v12, 1.0 // 1 + (k1 * x * x) @@ -23232,7 +23210,7 @@ s_nop 0 // 1 wait states v_fma_f32 v12, -2.0, v12, 2.0 // ( + 1 (fused)) v_mul_f32 v12, v11, v12 // x * (1 + tanh(...)) v_mul_f32 v11, 0.5, v12 // 0.5 * x * (1 + tanh(...)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] label_Activation_Relu_VW8: v_max_f32 v4, v4, 0 // x = max(0, x) v_max_f32 v5, v5, 0 // x = max(0, x) @@ -23242,7 +23220,7 @@ v_max_f32 v8, v8, 0 // x = max(0, x) v_max_f32 v9, v9, 0 // x = max(0, x) v_max_f32 v10, v10, 0 // x = max(0, x) v_max_f32 v11, v11, 0 // x = max(0, x) -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] label_Activation_Silu_VW8: v_mul_f32 v12, -1.4426950408889634, v4 // (fused -1.442695) v_exp_f32 v12, v12 // exp step 2 @@ -23300,9 +23278,27 @@ v_add_f32 v12, 1.0, v12 // 1 + exp(-x) v_rcp_f32 v12, v12 // 1 / (1 + exp(-x)) s_nop 0 // 1 wait states v_mul_f32 v11, v11, v12 // x / (1 + exp(-x)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] +label_Activation_Clamp_VW8: +v_min_f32 v4, s[sgpractivationBeta], v4 // min(x, beta) +v_max_f32 v4, s[sgpractivationAlpha], v4 // max(alpha, min(x, beta)) +v_min_f32 v5, s[sgpractivationBeta], v5 // min(x, beta) +v_max_f32 v5, s[sgpractivationAlpha], v5 // max(alpha, min(x, beta)) +v_min_f32 v6, s[sgpractivationBeta], v6 // min(x, beta) +v_max_f32 v6, s[sgpractivationAlpha], v6 // max(alpha, min(x, beta)) +v_min_f32 v7, s[sgpractivationBeta], v7 // min(x, beta) +v_max_f32 v7, s[sgpractivationAlpha], v7 // max(alpha, min(x, beta)) +v_min_f32 v8, s[sgpractivationBeta], v8 // min(x, beta) +v_max_f32 v8, s[sgpractivationAlpha], v8 // max(alpha, min(x, beta)) +v_min_f32 v9, s[sgpractivationBeta], v9 // min(x, beta) +v_max_f32 v9, s[sgpractivationAlpha], v9 // max(alpha, min(x, beta)) +v_min_f32 v10, s[sgpractivationBeta], v10 // min(x, beta) +v_max_f32 v10, s[sgpractivationAlpha], v10 // max(alpha, min(x, beta)) +v_min_f32 v11, s[sgpractivationBeta], v11 // min(x, beta) +v_max_f32 v11, s[sgpractivationAlpha], v11 // max(alpha, min(x, beta)) +s_setpc_b64 s[66:67] label_Activation_None_VW1: -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] label_Activation_Gelu_VW1: v_mul_f32 v12, 0x3d372713, v4 // k1 * x v_fma_f32 v12, v4, v12, 1.0 // 1 + (k1 * x * x) @@ -23316,10 +23312,10 @@ s_nop 0 // 1 wait states v_fma_f32 v12, -2.0, v12, 2.0 // ( + 1 (fused)) v_mul_f32 v12, v4, v12 // x * (1 + tanh(...)) v_mul_f32 v4, 0.5, v12 // 0.5 * x * (1 + tanh(...)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] label_Activation_Relu_VW1: v_max_f32 v4, v4, 0 // x = max(0, x) -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] label_Activation_Silu_VW1: v_mul_f32 v12, -1.4426950408889634, v4 // (fused -1.442695) v_exp_f32 v12, v12 // exp step 2 @@ -23328,7 +23324,11 @@ v_add_f32 v12, 1.0, v12 // 1 + exp(-x) v_rcp_f32 v12, v12 // 1 / (1 + exp(-x)) s_nop 0 // 1 wait states v_mul_f32 v4, v4, v12 // x / (1 + exp(-x)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[66:67] +label_Activation_Clamp_VW1: +v_min_f32 v4, s[sgpractivationBeta], v4 // min(x, beta) +v_max_f32 v4, s[sgpractivationAlpha], v4 // max(alpha, min(x, beta)) +s_setpc_b64 s[66:67] label_SK_Partials: label_GW_Partials_E0: s_mov_b64 s[sgprSrdWS+0:sgprSrdWS+0+1], s[sgprAddressWS+0:sgprAddressWS+0+1] // init SRD base address @@ -23726,24 +23726,24 @@ s_nop 0 // 1 wait state required when s_waitcnt vmcnt(0) // wait for data store s_barrier // store all data before setting flag s_lshl_b32 s8, s[sgprStreamKIdx], 2 // flag offset based on CTA index -v_readfirstlane_b32 s72, v[vgprSerial] // Wave 0 updates flags -s_cmp_eq_u32 s72, 0 // Check for wave 0 +v_readfirstlane_b32 s65, v[vgprSerial] // Wave 0 updates flags +s_cmp_eq_u32 s65, 0 // Check for wave 0 s_cbranch_scc0 label_SK_SkipFlagSet // Skip flag set -s_mov_b32 s72, 1 // flag data -s_store_dword s72, s[sgprAddressFlags:sgprAddressFlags+1], s8 glc // set flag +s_mov_b32 s65, 1 // flag data +s_store_dword s65, s[sgprAddressFlags:sgprAddressFlags+1], s8 glc // set flag label_SK_SkipFlagSet: s_waitcnt lgkmcnt(0) // wait for flag s_branch label_GW_End // jump to end label_GW_End: -s_mov_b32 s[sgprAlpha], s67 // Restore alpha value +s_mov_b32 s[sgprAlpha], s64 // Restore alpha value s_cmp_ge_u32 s[sgprStreamKIter], s[sgprStreamKIterEnd] // Check if done all StreamK iterations s_cbranch_scc1 label_NoBranch_Y57Y54XUE2DV604X // Only branch on scc0 -s_getpc_b64 s[72:73] // addr of next instr -s_add_i32 s74, label_PersistentLoopStart, 4 // target branch offset -s_abs_i32 s74, s74 // abs offset -s_sub_u32 s72, s72, s74 // sub target branch offset -s_subb_u32 s73, s73, 0 // sub high and carry -s_setpc_b64 s[72:73] // branch to label_PersistentLoopStart +s_getpc_b64 s[64:65] // addr of next instr +s_add_i32 s66, label_PersistentLoopStart, 4 // target branch offset +s_abs_i32 s66, s66 // abs offset +s_sub_u32 s64, s64, s66 // sub target branch offset +s_subb_u32 s65, s65, 0 // sub high and carry +s_setpc_b64 s[64:65] // branch to label_PersistentLoopStart label_NoBranch_Y57Y54XUE2DV604X: label_KernelEnd: s_endpgm // Kernel End diff --git a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s index ad6850d022bb..165dcf42b287 100644 --- a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s +++ b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s @@ -30,7 +30,7 @@ .text /* Num VGPR =249 */ /* Num AccVGPR=256 */ -/* Num SGPR =111 */ +/* Num SGPR =105 */ /******************************************/ /* Optimizations and Config: */ @@ -229,106 +229,71 @@ amdhsa.kernels: .offset: 116 .value_kind: by_value .value_type: f32 - - .name: MagicNumberProblemNumGroupTiles0 + - .name: ItersPerTile .size: 4 .offset: 120 .value_kind: by_value .value_type: u32 - - .name: MagicShiftProblemNumGroupTiles0 + - .name: TotalIters .size: 4 .offset: 124 .value_kind: by_value .value_type: u32 - - .name: ItersPerTile + - .name: SKItersPerWG .size: 4 .offset: 128 .value_kind: by_value .value_type: u32 - - .name: MagicNumberItersPerTile + - .name: skGridAndTiles .size: 4 .offset: 132 .value_kind: by_value .value_type: u32 - - .name: MagicShiftItersPerTile - .size: 4 - .offset: 136 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumProblemNumGroupTiles0By1 - .size: 4 - .offset: 140 - .value_kind: by_value - .value_type: u32 - - .name: MagicShiftProblemNumGroupTiles0By1 - .size: 4 - .offset: 144 - .value_kind: by_value - .value_type: u32 - - .name: TotalIters - .size: 4 - .offset: 148 - .value_kind: by_value - .value_type: u32 - - .name: SKItersPerWG - .size: 4 - .offset: 152 - .value_kind: by_value - .value_type: u32 - - .name: skGrid - .size: 4 - .offset: 156 - .value_kind: by_value - .value_type: u32 - - .name: skTiles - .size: 4 - .offset: 160 - .value_kind: by_value - .value_type: u32 - .name: skExtraIters .size: 4 - .offset: 164 + .offset: 136 .value_kind: by_value .value_type: u32 - .name: AddressScaleAlphaVec .size: 8 - .offset: 168 + .offset: 140 .value_kind: global_buffer .value_type: f32 .address_space: generic - .name: bias .size: 8 - .offset: 176 + .offset: 148 .value_kind: global_buffer .value_type: void .address_space: generic - .name: biasType .size: 4 - .offset: 184 + .offset: 156 .value_kind: by_value .value_type: u32 - .name: StrideBias .size: 4 - .offset: 188 + .offset: 160 .value_kind: by_value .value_type: u32 - .name: activationAlpha .size: 4 - .offset: 192 + .offset: 164 .value_kind: by_value .value_type: f32 - .name: activationBeta .size: 4 - .offset: 196 + .offset: 168 .value_kind: by_value .value_type: f32 - .name: activationType .size: 4 - .offset: 200 + .offset: 172 .value_kind: by_value .value_type: u32 .group_segment_fixed_size: 133120 .kernarg_segment_align: 8 - .kernarg_segment_size: 208 + .kernarg_segment_size: 176 .max_flat_workgroup_size: 256 .private_segment_fixed_size: 0 .sgpr_count: 102 @@ -339,7 +304,6 @@ amdhsa.kernels: ... .end_amdgpu_metadata Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950: -label_ASM_Start: /// Main body of the asm kernel .macro V_MAGIC_DIV vgprDstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req v_mul_hi_u32 v[\vgprDstIdx+1], \dividend, \magicNumber v_mul_lo_u32 v[\vgprDstIdx+0], \dividend, \magicA @@ -402,28 +366,21 @@ label_ASM_Start: /// Main body of the asm kernel .set sgprStridesB, 42 .set sgprAlpha, 44 .set sgprBeta, 45 -.set sgprMagicNumberProblemNumGroupTiles0, 46 -.set sgprMagicShiftProblemNumGroupTiles0, 47 -.set sgprItersPerTile, 48 -.set sgprMagicNumberItersPerTile, 49 -.set sgprMagicShiftItersPerTile, 50 -.set sgprMagicNumProblemNumGroupTiles0By1, 51 -.set sgprMagicShiftProblemNumGroupTiles0By1, 52 -.set sgprTotalIters, 53 -.set sgprSKItersPerWG, 54 -.set sgprskGrid, 55 -.set sgprskTiles, 56 -.set sgprskExtraIters, 57 -.set sgprLocalWriteAddrA, 58 -.set sgprLocalWriteAddrB, 59 -.set sgprSwapA, 60 -.set sgprSwapB, 61 -.set sgprStreamKIdx, 62 -.set sgprStreamKIter, 63 -.set sgprStreamKIterEnd, 64 -.set sgprStreamKLocalStart, 65 -.set sgprStreamKLocalEnd, 66 -.set sgprSrdWS, 68 +.set sgprItersPerTile, 46 +.set sgprTotalIters, 47 +.set sgprSKItersPerWG, 48 +.set sgprskGridAndTiles, 49 +.set sgprskExtraIters, 50 +.set sgprLocalWriteAddrA, 51 +.set sgprLocalWriteAddrB, 52 +.set sgprSwapA, 53 +.set sgprSwapB, 54 +.set sgprStreamKIdx, 55 +.set sgprStreamKIter, 56 +.set sgprStreamKIterEnd, 57 +.set sgprStreamKLocalStart, 58 +.set sgprStreamKLocalEnd, 59 +.set sgprSrdWS, 60 /* Size Assignments */ .set sgprSizeI, sgprSizesFree+0 @@ -504,29 +461,30 @@ label_ASM_Start: /// Main body of the asm kernel /******************************************/ /* Load num of Gemms */ -s_load_dword s67, s[sgprKernArgAddress:sgprKernArgAddress+1], 0 +s_load_dword s64, s[sgprKernArgAddress:sgprKernArgAddress+1], 0 /* Load packed kernel args (StaggerU/GSU) */ -s_load_dword s73, s[sgprKernArgAddress:sgprKernArgAddress+1], 4 +s_load_dword s66, s[sgprKernArgAddress:sgprKernArgAddress+1], 4 /* Load WGM data */ s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 8 /* Load num of WGs */ -s_load_dword s74, s[sgprKernArgAddress:sgprKernArgAddress+1], 12 +s_load_dword s67, s[sgprKernArgAddress:sgprKernArgAddress+1], 12 s_waitcnt lgkmcnt(0) // load args -s_lshr_b32 s72, s67, 0x1e // Get arg type -s_and_b32 s67, 0x3fffffff, s67 // Get nums of gemm -s_cmp_eq_u32 s72, 0 // Is kernel args +s_lshr_b32 s65, s64, 0x1e // Get arg type +s_and_b32 s64, 0x3fffffff, s64 // Get nums of gemm +s_cmp_eq_u32 s65, 0 // Is kernel args s_cbranch_scc0 label_HBMArgs s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Load Kernel Args */ s_load_dwordx16 s[20:35], s[sgprKernArgAddress:sgprKernArgAddress+1], 0 // 0 -s_load_dwordx16 s[36:51], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 -s_load_dwordx4 s[52:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 128 // 128 -s_load_dwordx2 s[56:57], s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx8 s[36:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // 64 +s_load_dwordx4 s[44:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 +s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120 s_waitcnt lgkmcnt(0) // preload s_branch label_LoadArgsEnd label_HBMArgs: @@ -537,9 +495,7 @@ s_waitcnt lgkmcnt(0) // wait for args to load label_LoadArgsEnd: s_branch label_common_kernel_entry -/* pad 35 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */ -s_nop 0 -s_nop 0 +/* pad 33 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */ s_nop 0 s_nop 0 s_nop 0 @@ -574,10 +530,10 @@ s_nop 0 s_nop 0 s_nop 0 label_Preload_Offset_Start: -s_and_b32 s67, 0x3fffffff, s2 // Get nums of gemm -s_lshr_b32 s72, s2, 0x1e // Get arg type -s_mov_b32 s73, s3 // Preload internal args -s_cmp_eq_u32 s72, 0 // Is kernel args +s_and_b32 s64, 0x3fffffff, s2 // Get nums of gemm +s_lshr_b32 s65, s2, 0x1e // Get arg type +s_mov_b32 s66, s3 // Preload internal args +s_cmp_eq_u32 s65, 0 // Is kernel args s_cbranch_scc0 label_Preload_HBMArgs s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 @@ -585,9 +541,9 @@ s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Load Kernel Args */ s_load_dword s27, s[sgprKernArgAddress:sgprKernArgAddress+1], 28 // 28 s_load_dwordx16 s[28:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 32 // 32 -s_load_dwordx8 s[44:51], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96 -s_load_dwordx4 s[52:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 128 // 128 -s_load_dwordx2 s[56:57], s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx4 s[44:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 +s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120 s_mov_b64 s[20:21], s[6:7] // move preload data to correct sgpr s_mov_b64 s[22:23], s[8:9] // move preload data to correct sgpr s_mov_b64 s[24:25], s[10:11] // move preload data to correct sgpr @@ -597,90 +553,90 @@ label_Preload_HBMArgs: s_mov_b64 s[sgprKernArgAddress:sgprKernArgAddress+1], s[6:7] // Load address of kernel arguments label_Preload_LoadArgsEnd: s_mov_b32 s[sgprWGM], s4 // Preload internal args2 -s_mov_b32 s74, s5 // Load num of WGs +s_mov_b32 s67, s5 // Load num of WGs label_common_kernel_entry: /// for both preload/non-preload common code s_mov_b32 s[sgprWorkGroup0+0], s13 // restore workgroup id s_mov_b32 s[sgprWorkGroup0+1], s14 // restore workgroup id s_mov_b32 s[sgprWorkGroup0+2], s15 // restore workgroup id -s_and_b32 s[sgprStaggerU], s73, 0xffff0000 // Restore StaggerU related vars +s_and_b32 s[sgprStaggerU], s66, 0xffff0000 // Restore StaggerU related vars s_lshr_b32 s[sgprStaggerU], s[sgprStaggerU], 0x10 -s_mov_b32 s[sgprArgType], s72 +s_mov_b32 s[sgprArgType], s65 s_mov_b32 m0, 0x20800 // LDS clamp at 133120 bytes v_mov_b32 v[vgprSerial], v0 // thread serial id /* remap workgroup to XCCs */ -s_lshr_b32 s80, s[sgprWGM], 0x10 // Get WGMXCC -s_ff1_i32_b32 s80, s80 // Get log(WGMXCC) -s_lshr_b32 s81, s[sgprWGM], 0x16 // Get CU_Count +s_lshr_b32 s72, s[sgprWGM], 0x10 // Get WGMXCC +s_ff1_i32_b32 s72, s72 // Get log(WGMXCC) +s_lshr_b32 s73, s[sgprWGM], 0x16 // Get CU_Count /* remap WGs if WGMXCC > 1 ( log(WGMXCC) > 0 ) */ -s_cmp_gt_i32 s80, 0 +s_cmp_gt_i32 s72, 0 s_cbranch_scc0 label_skip_WGMXCC /* only remap WGs in the range */ -s_lshr_b32 s77, s74, s80 -s_lshl_b32 s77, s77, s80 -s_cmp_ge_u32 s[sgprWorkGroup0], s77 +s_lshr_b32 s69, s67, s72 +s_lshl_b32 s69, s69, s72 +s_cmp_ge_u32 s[sgprWorkGroup0], s69 s_cbranch_scc1 label_skip_WGMXCC -s_cmp_eq_u32 s81, 0 // CU_Count == 0 ? +s_cmp_eq_u32 s73, 0 // CU_Count == 0 ? s_cbranch_scc0 label_XCCG_nonzero -s_lshr_b32 s77, s[sgprWorkGroup0], s80 -s_bfm_b32 s78, s80, 0 -s_and_b32 s78, s[sgprWorkGroup0], s78 -s_lshr_b32 s79, s74, s80 -s_mul_i32 s78, s78, s79 -s_add_u32 s[sgprWorkGroup0], s77, s78 +s_lshr_b32 s69, s[sgprWorkGroup0], s72 +s_bfm_b32 s70, s72, 0 +s_and_b32 s70, s[sgprWorkGroup0], s70 +s_lshr_b32 s71, s67, s72 +s_mul_i32 s70, s70, s71 +s_add_u32 s[sgprWorkGroup0], s69, s70 s_branch label_skip_WGMXCC label_XCCG_nonzero: /* temp0 = (wg//CU_Count)*CU_Count */ -v_cvt_f32_u32 v4, s81 // wg//CU_Count +v_cvt_f32_u32 v4, s73 // wg//CU_Count v_rcp_iflag_f32 v4, v4 // wg//CU_Count v_cvt_f32_u32 v5, s[sgprWorkGroup0] // wg//CU_Count v_mul_f32 v4, v4, v5 // wg//CU_Count v_cvt_u32_f32 v4, v4 // wg//CU_Count -v_mul_u32_u24 v5, v4, s81 // wg//CU_Count +v_mul_u32_u24 v5, v4, s73 // wg//CU_Count v_sub_u32 v5, s[sgprWorkGroup0], v5 // wg//CU_Count -v_cmpx_eq_u32 exec, v5, s81 // wg//CU_Count +v_cmpx_eq_u32 exec, v5, s73 // wg//CU_Count v_add_u32 v4, 1, v4 // wg//CU_Count v_mov_b32 v5, 0 // wg//CU_Count s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s81 // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s73 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 -v_mul_u32_u24 v5, v4, s81 // re-calculate remainder +v_mul_u32_u24 v5, v4, s73 // re-calculate remainder v_sub_u32 v5, s[sgprWorkGroup0], v5 // re-calculate remainder s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s77, v4 // quotient -v_readfirstlane_b32 s78, v5 // remainder -s_mul_i32 s77, s77, s81 +v_readfirstlane_b32 s69, v4 // quotient +v_readfirstlane_b32 s70, v5 // remainder +s_mul_i32 s69, s69, s73 /* temp1 = (wg%CU_Count)//WGMXCC */ -s_lshr_b32 s78, s78, s80 +s_lshr_b32 s70, s70, s72 /* temp0 = temp0 + temp1 */ -s_add_u32 s77, s77, s78 +s_add_u32 s69, s69, s70 /* temp1 = (wg%WGMXCC) * ((WGs - (WGs//CU_Count) * CU_Count) if (wg > (WGs//CU_Count) * CU_Count) else CU_Count)//WGMXCC */ -v_cvt_f32_u32 v4, s81 // WGs//CU_Count +v_cvt_f32_u32 v4, s73 // WGs//CU_Count v_rcp_iflag_f32 v4, v4 // WGs//CU_Count -v_cvt_f32_u32 v5, s74 // WGs//CU_Count +v_cvt_f32_u32 v5, s67 // WGs//CU_Count v_mul_f32 v4, v4, v5 // WGs//CU_Count v_cvt_u32_f32 v4, v4 // WGs//CU_Count -v_mul_u32_u24 v5, v4, s81 // WGs//CU_Count -v_sub_u32 v5, s74, v5 // WGs//CU_Count -v_cmpx_eq_u32 exec, v5, s81 // WGs//CU_Count +v_mul_u32_u24 v5, v4, s73 // WGs//CU_Count +v_sub_u32 v5, s67, v5 // WGs//CU_Count +v_cmpx_eq_u32 exec, v5, s73 // WGs//CU_Count v_add_u32 v4, 1, v4 // WGs//CU_Count s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s81 // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s73 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s78, v4 // quotient -s_mul_i32 s78, s78, s81 -s_sub_u32 s79, s74, s78 -s_cmp_gt_u32 s[sgprWorkGroup0], s78 -s_cselect_b32 s78, s79, s81 -s_lshr_b32 s78, s78, s80 -s_bfm_b32 s79, s80, 0 -s_and_b32 s79, s[sgprWorkGroup0], s79 -s_mul_i32 s78, s78, s79 +v_readfirstlane_b32 s70, v4 // quotient +s_mul_i32 s70, s70, s73 +s_sub_u32 s71, s67, s70 +s_cmp_gt_u32 s[sgprWorkGroup0], s70 +s_cselect_b32 s70, s71, s73 +s_lshr_b32 s70, s70, s72 +s_bfm_b32 s71, s72, 0 +s_and_b32 s71, s[sgprWorkGroup0], s71 +s_mul_i32 s70, s70, s71 /* WorkGroup0 = temp0 + temp1 */ -s_add_u32 s[sgprWorkGroup0], s77, s78 +s_add_u32 s[sgprWorkGroup0], s69, s70 label_skip_WGMXCC: /// skip WGMXCC if no enough WGs to remap -s_cmp_eq_u32 s72, 0 +s_cmp_eq_u32 s65, 0 s_cbranch_scc0 label_MultiGemm /* init: add vgpr [4...136) to pool */ /* init: add vgpr [0...0) to pool */ @@ -710,97 +666,98 @@ v_cmp_ne_u32 vcc, v7, 0 // v4 = ceil(v5 / v6) v_addc_co_u32 v4, vcc, v4, 0, vcc // ceil s_nop 0 // 1 wait states v_readfirstlane_b32 s[sgprNumWorkGroups1], v4 // set back to numWorkGroup1 -s_waitcnt lgkmcnt(0) // wait for 108/0 bytes of kern args +s_waitcnt lgkmcnt(0) // wait for 80/0 bytes of kern args s_branch label_MultiGemmEnd label_MultiGemm: /* Check if custom structure pointer is null */ s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ? s_cbranch_scc1 label_IsExternalValid // branch if ArgType == 2 -s_mov_b32 s11, 188 -s_mul_i32 s78, s67, 4 -s_mov_b64 s[72:73], s[sgprKernArgAddress:sgprKernArgAddress+1] +s_mov_b32 s11, 160 +s_mul_i32 s72, s64, 4 +s_mov_b64 s[66:67], s[sgprKernArgAddress:sgprKernArgAddress+1] s_branch label_IsExternalValidEnd label_IsExternalValid: -s_mov_b32 s11, 244 -s_mov_b32 s78, 0 -s_mov_b64 s[72:73], s[sgprKernArgAddress:sgprKernArgAddress+1] +s_mov_b32 s11, 216 +s_mov_b32 s72, 0 +s_mov_b64 s[66:67], s[sgprKernArgAddress:sgprKernArgAddress+1] label_IsExternalValidEnd: /* Grouped Gemm:: prefetch 1 arg load */ s_mov_b32 s10, 1 -s_mov_b32 s79, 0 -s_load_dwordx4 s[20:23], s[72:73], s78 -s_cmpk_eq_u32 s67, 1 // if gemm_count is 1? +s_mov_b32 s73, 0 +s_load_dwordx4 s[20:23], s[66:67], s72 +s_cmpk_eq_u32 s64, 1 // if gemm_count is 1? s_cbranch_scc1 label_wgTable_noLoadLoop /* Grouped Gemm:: accumulate numTiles for each gemm */ /* Grouped Gemm:: loop start */ label_Loop_GemmCount: s_waitcnt lgkmcnt(0) -s_lshr_b32 s76, s20, 8 // s76 = s20 / 256 -s_and_b32 s74, 255, s20 // s74 = s20 % 256 -s_addc_u32 s76, s76, 0 -s_lshr_b32 s77, s21, 8 // s77 = s21 / 256 -s_and_b32 s74, 255, s21 // s74 = s21 % 256 -s_addc_u32 s77, s77, 0 -s_mul_i32 s76, s76, s77 -s_mul_i32 s76, s76, s22 -s_add_u32 s79, s79, s76 -s_cmp_lt_u32 s[sgprWorkGroup0], s79 +s_lshr_b32 s70, s20, 8 // s70 = s20 / 256 +s_and_b32 s68, 255, s20 // s68 = s20 % 256 +s_addc_u32 s70, s70, 0 +s_lshr_b32 s71, s21, 8 // s71 = s21 / 256 +s_and_b32 s68, 255, s21 // s68 = s21 % 256 +s_addc_u32 s71, s71, 0 +s_mul_i32 s70, s70, s71 +s_mul_i32 s70, s70, s22 +s_add_u32 s73, s73, s70 +s_cmp_lt_u32 s[sgprWorkGroup0], s73 s_cbranch_scc1 label_FOUND -s_add_u32 s78, s78, s11 -s_load_dwordx4 s[20:23], s[72:73], s78 +s_add_u32 s72, s72, s11 +s_load_dwordx4 s[20:23], s[66:67], s72 s_add_u32 s10, s10, 1 -s_cmp_lt_u32 s10, s67 +s_cmp_lt_u32 s10, s64 s_cbranch_scc1 label_Loop_GemmCount /* Grouped Gemm:: noLoadLoop */ label_wgTable_noLoadLoop: s_waitcnt lgkmcnt(0) -s_lshr_b32 s76, s20, 8 // s76 = s20 / 256 -s_and_b32 s74, 255, s20 // s74 = s20 % 256 -s_addc_u32 s76, s76, 0 -s_lshr_b32 s77, s21, 8 // s77 = s21 / 256 -s_and_b32 s74, 255, s21 // s74 = s21 % 256 -s_addc_u32 s77, s77, 0 -s_mul_i32 s76, s76, s77 -s_mul_i32 s76, s76, s22 -s_add_u32 s79, s79, s76 +s_lshr_b32 s70, s20, 8 // s70 = s20 / 256 +s_and_b32 s68, 255, s20 // s68 = s20 % 256 +s_addc_u32 s70, s70, 0 +s_lshr_b32 s71, s21, 8 // s71 = s21 / 256 +s_and_b32 s68, 255, s21 // s68 = s21 % 256 +s_addc_u32 s71, s71, 0 +s_mul_i32 s70, s70, s71 +s_mul_i32 s70, s70, s22 +s_add_u32 s73, s73, s70 /* Grouped Gemm:: gemmIndex found */ label_FOUND: -s_sub_u32 s73, s10, 1 -s_sub_u32 s72, s79, s76 -s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s72 +s_sub_u32 s67, s10, 1 +s_sub_u32 s66, s73, s70 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s66 /* Check if custom structure pointer is null */ s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ? s_cbranch_scc1 label_LoadExternalStruct // branch if ArgType == 2 /* Grouped Gemm: offset argument address to gemm */ /* Grouped Gemm: offset address from wg_table_start to args_start */ -s_lshl2_add_u32 s[sgprKernArgAddress], s67, s[sgprKernArgAddress] +s_lshl2_add_u32 s[sgprKernArgAddress], s64, s[sgprKernArgAddress] s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Grouped Gemm: offset address from args_start to gemm_start */ -s_mul_i32 s73, s73, 188 -s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s73 +s_mul_i32 s67, s67, 160 +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s67 s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 /* Load Kernel Args */ s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16 -s_load_dwordx16 s[40:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 -s_load_dwordx2 s[56:57], s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx8 s[40:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 +s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120 s_branch label_LoadExternalStructEnd label_LoadExternalStruct: /* Grouped Gemm: offset address from args_start to gemm_start */ -s_mul_i32 s73, s73, 244 -s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s73 +s_mul_i32 s67, s67, 216 +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s67 s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16 -s_load_dwordx16 s[40:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 -s_load_dword s56, s[sgprKernArgAddress:sgprKernArgAddress+1], 144 // 144 +s_load_dwordx8 s[40:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80 +s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112 // Read Beta -s_load_dword s45, s[sgprKernArgAddress:sgprKernArgAddress+1], 160 // 160 +s_load_dword s45, s[sgprKernArgAddress:sgprKernArgAddress+1], 132 // 132 label_LoadExternalStructEnd: /* init: add vgpr [4...136) to pool */ /* init: add vgpr [0...0) to pool */ @@ -830,7 +787,7 @@ v_cmp_ne_u32 vcc, v7, 0 // v4 = ceil(v5 / v6) v_addc_co_u32 v4, vcc, v4, 0, vcc // ceil s_nop 0 // 1 wait states v_readfirstlane_b32 s[sgprNumWorkGroups1], v4 // set back to numWorkGroup1 -s_waitcnt lgkmcnt(0) // wait for 108/0 bytes of kern args +s_waitcnt lgkmcnt(0) // wait for 80/0 bytes of kern args /* Early stop if N(SizeFreeJ) == 0 */ s_cmp_eq_u32 s[sgprSizeJ], 0 @@ -840,26 +797,17 @@ s_endpgm label_NoEarlyStop_N0: label_MultiGemmEnd: -.set sgprSrdA, 72 -.set sgprSrdB, 76 -.set sgprShadowLimitA, 80 -.set sgprShadowLimitB, 82 -.set sgprStaggerUIter, 67 -.set sgprWrapUA, sgprKernArgAddress -.set sgprWrapUB, 84 -.set sgprGlobalReadIncsA, 86 -.set sgprGlobalReadIncsB, 87 -.set sgprScalarGlobalReadOffsetA, 88 -.set sgprScalarGlobalReadOffsetB, 95 - -.set sgpr104, 88 -.set sgpr105, 89 -.set sgpr106, 90 -.set sgpr107, 91 -.set sgpr108, 92 -.set sgpr109, 93 -.set sgpr110, 94 - +.set sgprSrdA, 64 +.set sgprSrdB, 68 +.set sgprShadowLimitA, 72 +.set sgprShadowLimitB, 74 +.set sgprStaggerUIter, 76 +.set sgprWrapUA, 77 +.set sgprWrapUB, 79 +.set sgprGlobalReadIncsA, 81 +.set sgprGlobalReadIncsB, 82 +.set sgprScalarGlobalReadOffsetA, 83 +.set sgprScalarGlobalReadOffsetB, 90 s_sub_u32 s[sgprAddressA+0], s[sgprAddressA+0], 16 // pre-pad to make room for possible pointer shift s_subb_u32 s[sgprAddressA+1], s[sgprAddressA+1], 0 // pre-pad to make room for possible pointer shift s_sub_u32 s[sgprAddressB+0], s[sgprAddressB+0], 16 // pre-pad to make room for possible pointer shift @@ -873,28 +821,30 @@ label_AlphaNonZero: s_mov_b32 s[sgprStreamKIdx], s[sgprWorkGroup0] // Save original StreamK index s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprItersPerTile] // DP starting iteration (case: DP work to do) s_mov_b32 s[sgprStreamKIterEnd], s[sgprTotalIters] // DP ending iteration (case: only DP work to do) -s_mul_i32 s[sgpr104], s[sgprskTiles], s[sgprItersPerTile] // Total SK iters -s_cmp_lt_u32 s[sgpr104], s[sgprTotalIters] // Check if there are DP tiles to do +s_and_b32 s97, s[sgprskGridAndTiles], 0xffff // Get skTiles +s_mul_i32 s97, s97, s[sgprItersPerTile] // Total SK iters +s_cmp_lt_u32 s97, s[sgprTotalIters] // Check if there are DP tiles to do s_cbranch_scc1 label_SK_InitDone // Done init s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprSKItersPerWG] // StreamK starting iteration (case: after extra iters) s_add_u32 s[sgprStreamKIter], s[sgprStreamKIter], s[sgprskExtraIters] // Add extra iters s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIter], s[sgprSKItersPerWG] // StreamK ending iteration (case: after extra iters) -s_add_u32 s[sgpr105], s[sgprSKItersPerWG], 1 // Spread out extra iterations -s_mul_i32 s[sgpr104], s[sgprStreamKIdx], s[sgpr105] // StreamK starting iteration (case: before extra iters) -s_add_u32 s[sgpr105], s[sgpr104], s[sgpr105] // StreamK ending iteration (case: before extra iters) +s_add_u32 s98, s[sgprSKItersPerWG], 1 // Spread out extra iterations +s_mul_i32 s97, s[sgprStreamKIdx], s98 // StreamK starting iteration (case: before extra iters) +s_add_u32 s98, s97, s98 // StreamK ending iteration (case: before extra iters) s_cmp_lt_u32 s[sgprStreamKIdx], s[sgprskExtraIters] // Check if lane gets an extra iteration -s_cselect_b32 s[sgprStreamKIter], s[sgpr104], s[sgprStreamKIter] // Set start iter -s_cselect_b32 s[sgprStreamKIterEnd], s[sgpr105], s[sgprStreamKIterEnd] // Set end iter -s_mul_i32 s[sgpr104], s[sgprskTiles], s[sgprItersPerTile] // Total SK iters -s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgpr104] // Cap ending iter at total SK iters +s_cselect_b32 s[sgprStreamKIter], s97, s[sgprStreamKIter] // Set start iter +s_cselect_b32 s[sgprStreamKIterEnd], s98, s[sgprStreamKIterEnd] // Set end iter +s_and_b32 s97, s[sgprskGridAndTiles], 0xffff // Get skTiles +s_mul_i32 s97, s97, s[sgprItersPerTile] // Total SK iters +s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s97 // Cap ending iter at total SK iters label_SK_InitDone: s_cmp_lt_u32 s[sgprStreamKIter], s[sgprTotalIters] // Make sure there's work to do s_cbranch_scc1 label_NoBranch_T8JHFHKM7BO5OHXW // Only branch on scc0 -s_getpc_b64 s[sgpr104:sgpr105] // addr of next instr -s_add_i32 s[sgpr106], label_KernelEnd, 4 // target branch offset -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr106] // add target branch offset -s_addc_u32 s[sgpr105], s[sgpr105], 0 // add high and carry -s_setpc_b64 s[sgpr104:sgpr105] // branch to label_KernelEnd +s_getpc_b64 s[98:99] // addr of next instr +s_add_i32 s100, label_KernelEnd, 4 // target branch offset +s_add_u32 s98, s98, s100 // add target branch offset +s_addc_u32 s99, s99, 0 // add high and carry +s_setpc_b64 s[98:99] // branch to label_KernelEnd label_NoBranch_T8JHFHKM7BO5OHXW: /******************************************/ @@ -902,19 +852,15 @@ label_NoBranch_T8JHFHKM7BO5OHXW: /******************************************/ label_PersistentLoopStart: +// Use sgprScalarGlobalReadOffsetA/B sgprs +.set sgpr102, 84 +.set sgpr103, 85 +.set sgpr104, 86 + /******************************************/ /* Begin setupNewTile */ /******************************************/ -// Use sgprScalarGlobalReadOffsetA/B sgprs -.set sgpr104, 88 -.set sgpr105, 89 -.set sgpr106, 90 -.set sgpr107, 91 -.set sgpr108, 92 -.set sgpr109, 93 -.set sgpr110, 94 - /* global read addresses: work-group */ /* graWorkGroup mapping */ @@ -928,78 +874,106 @@ v_min_i32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], v4 // Set LRA to first b v_xor_b32 v4, v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // Get other lds buffer offset value v_min_i32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], v4 // Set LRA to first buffer offset /* StreamK calculate tile idx and map to WG */ -s_mul_hi_u32 s[sgpr105], s[sgprStreamKIter], s[sgprMagicNumberItersPerTile] // s_magic mul, div alg 2 -s_lshr_b32 s[sgpr106], s[sgprMagicShiftItersPerTile], 31 // tmpS = extract abit -s_mul_i32 s[sgpr104], s[sgprStreamKIter], s[sgpr106] // s_magic mul, div alg 2 -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr105] -s_and_b32 s[sgpr106], s[sgprMagicShiftItersPerTile], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s[sgpr104], s[sgpr104], s[sgpr106] // sMagicDiv Alg 2 -s_mul_i32 s[sgpr105], s[sgpr104], s[sgprItersPerTile] // Tile start iteration -s_add_u32 s[sgpr106], s[sgpr105], s[sgprItersPerTile] // Tile end iteration -s_sub_u32 s[sgprStreamKLocalStart], s[sgprStreamKIter], s[sgpr105] // Local iteration start -s_min_u32 s[sgprStreamKLocalEnd], s[sgprStreamKIterEnd], s[sgpr106] // 1. (Local) iteration end (SK tile) -s_sub_u32 s[sgprStreamKLocalEnd], s[sgprStreamKLocalEnd], s[sgpr105] // 2. Local iteration end (SK tile) -s_mul_i32 s[sgpr107], s[sgprskTiles], s[sgprItersPerTile] // Total SK iters -s_sub_u32 s[sgpr107], s[sgprTotalIters], s[sgpr107] // Offset to first SK tile -s_mul_i32 s[sgpr105], s[sgprskGrid], s[sgprItersPerTile] // DP iterations shift -s_add_u32 s[sgpr105], s[sgpr105], s[sgprStreamKIter] // Add DP shift -s_cmp_lt_u32 s[sgpr105], s[sgpr107] // Check if still in DP section +v_cvt_f32_u32 v4, s[sgprItersPerTile] // StreamKIter // ItersPerTile +v_rcp_iflag_f32 v4, v4 // StreamKIter // ItersPerTile +v_cvt_f32_u32 v5, s[sgprStreamKIter] // StreamKIter // ItersPerTile +v_mul_f32 v4, v4, v5 // StreamKIter // ItersPerTile +v_cvt_u32_f32 v4, v4 // StreamKIter // ItersPerTile +v_mul_u32_u24 v5, v4, s[sgprItersPerTile] // StreamKIter // ItersPerTile +v_sub_u32 v5, s[sgprStreamKIter], v5 // StreamKIter // ItersPerTile +v_cmpx_eq_u32 exec, v5, s[sgprItersPerTile] // StreamKIter // ItersPerTile +v_add_u32 v4, 1, v4 // StreamKIter // ItersPerTile +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprItersPerTile] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s98, v4 // quotient +s_mul_i32 s99, s98, s[sgprItersPerTile] // Tile start iteration +s_add_u32 s100, s99, s[sgprItersPerTile] // Tile end iteration +s_sub_u32 s[sgprStreamKLocalStart], s[sgprStreamKIter], s99 // Local iteration start +s_min_u32 s[sgprStreamKLocalEnd], s[sgprStreamKIterEnd], s100 // 1. (Local) iteration end (SK tile) +s_sub_u32 s[sgprStreamKLocalEnd], s[sgprStreamKLocalEnd], s99 // 2. Local iteration end (SK tile) +s_and_b32 s101, s[sgprskGridAndTiles], 0xffff // Get skTiles +s_mul_i32 s101, s101, s[sgprItersPerTile] // Total SK iters +s_sub_u32 s101, s[sgprTotalIters], s101 // Offset to first SK tile +s_lshr_b32 s99, s[sgprskGridAndTiles], 0x10 // Get skGrid +s_mul_i32 s99, s99, s[sgprItersPerTile] // DP iterations shift +s_add_u32 s99, s99, s[sgprStreamKIter] // Add DP shift +s_cmp_lt_u32 s99, s101 // Check if still in DP section s_cbranch_scc1 label_SK_UpdateDone // Done update -s_mov_b32 s[sgpr105], s[sgpr106] // SK iterations shift -s_cmp_le_u32 s[sgpr107], s[sgprStreamKIter] // Check if continuing in SK section +s_mov_b32 s99, s100 // SK iterations shift +s_cmp_le_u32 s101, s[sgprStreamKIter] // Check if continuing in SK section s_cbranch_scc1 label_SK_UpdateDone // Done update s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprSKItersPerWG] // StreamK starting iteration (case: after extra iters) s_add_u32 s[sgprStreamKIter], s[sgprStreamKIter], s[sgprskExtraIters] // Add extra iters s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIter], s[sgprSKItersPerWG] // StreamK ending iteration (case: after extra iters) -s_add_u32 s[sgpr109], s[sgprSKItersPerWG], 1 // Spread out extra iterations -s_mul_i32 s[sgpr108], s[sgprStreamKIdx], s[sgpr109] // StreamK starting iteration (case: before extra iters) -s_add_u32 s[sgpr109], s[sgpr108], s[sgpr109] // StreamK ending iteration (case: before extra iters) +s_add_u32 s[sgpr103], s[sgprSKItersPerWG], 1 // Spread out extra iterations +s_mul_i32 s[sgpr102], s[sgprStreamKIdx], s[sgpr103] // StreamK starting iteration (case: before extra iters) +s_add_u32 s[sgpr103], s[sgpr102], s[sgpr103] // StreamK ending iteration (case: before extra iters) s_cmp_lt_u32 s[sgprStreamKIdx], s[sgprskExtraIters] // Check if lane gets an extra iteration -s_cselect_b32 s[sgprStreamKIter], s[sgpr108], s[sgprStreamKIter] // Set start iter -s_cselect_b32 s[sgprStreamKIterEnd], s[sgpr109], s[sgprStreamKIterEnd] // Set end iter -s_add_u32 s[sgpr105], s[sgprStreamKIter], s[sgpr107] // Offset to start of SK section -s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgpr107] // Offset to start of SK section +s_cselect_b32 s[sgprStreamKIter], s[sgpr102], s[sgprStreamKIter] // Set start iter +s_cselect_b32 s[sgprStreamKIterEnd], s[sgpr103], s[sgprStreamKIterEnd] // Set end iter +s_add_u32 s99, s[sgprStreamKIter], s101 // Offset to start of SK section +s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s101 // Offset to start of SK section s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgprTotalIters] // Cap ending iter at total SK iters s_cmp_lt_u32 s[sgprStreamKIter], s[sgprTotalIters] // Make sure there's work to do s_cbranch_scc1 label_NoBranch_S4FDBQ587JJL6NOU // Only branch on scc0 -s_getpc_b64 s[sgpr108:sgpr109] // addr of next instr -s_add_i32 s[sgpr110], label_KernelEnd, 4 // target branch offset -s_add_u32 s[sgpr108], s[sgpr108], s[sgpr110] // add target branch offset -s_addc_u32 s[sgpr109], s[sgpr109], 0 // add high and carry -s_setpc_b64 s[sgpr108:sgpr109] // branch to label_KernelEnd +s_getpc_b64 s[sgpr102:sgpr103] // addr of next instr +s_add_i32 s[sgpr104], label_KernelEnd, 4 // target branch offset +s_add_u32 s[sgpr102], s[sgpr102], s[sgpr104] // add target branch offset +s_addc_u32 s[sgpr103], s[sgpr103], 0 // add high and carry +s_setpc_b64 s[sgpr102:sgpr103] // branch to label_KernelEnd label_NoBranch_S4FDBQ587JJL6NOU: label_SK_UpdateDone: -s_mov_b32 s[sgprStreamKIter], s[sgpr105] // Store current iteration +s_mov_b32 s[sgprStreamKIter], s99 // Store current iteration /* Map StreamK tile index to wg0/1/2 */ -s_mul_hi_u32 s[sgpr106], s[sgpr104], s[sgprMagicNumProblemNumGroupTiles0By1] // s_magic mul, div alg 2 -s_lshr_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0By1], 31 // tmpS = extract abit -s_mul_i32 s[sgpr105], s[sgpr104], s[sgpr107] // s_magic mul, div alg 2 -s_add_u32 s[sgpr105], s[sgpr105], s[sgpr106] -s_and_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0By1], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s[sgpr105], s[sgpr105], s[sgpr107] // sMagicDiv Alg 2 -s_mov_b32 s[sgprWorkGroup2], s[sgpr105] // wg2 = Tile Idx / problemNumGroupTiles0By1 -s_mul_i32 s[sgpr105], s[sgpr105], s[sgprNumWorkGroups0] // remainder part 1 : quotient * divisor -s_mul_i32 s[sgpr105], s[sgpr105], s[sgprNumWorkGroups1] // remainder part 1 : quotient * divisor -s_sub_u32 s[sgpr104], s[sgpr104], s[sgpr105] // remainder -s_mul_hi_u32 s[sgpr106], s[sgpr104], s[sgprMagicNumberProblemNumGroupTiles0] // s_magic mul, div alg 2 -s_lshr_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0], 31 // tmpS = extract abit -s_mul_i32 s[sgpr105], s[sgpr104], s[sgpr107] // s_magic mul, div alg 2 -s_add_u32 s[sgpr105], s[sgpr105], s[sgpr106] -s_and_b32 s[sgpr107], s[sgprMagicShiftProblemNumGroupTiles0], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s[sgpr105], s[sgpr105], s[sgpr107] // sMagicDiv Alg 2 -s_mov_b32 s[sgprWorkGroup1], s[sgpr105] // wg1 = Tile Idx / problemNumGroupTiles0 -s_mul_i32 s[sgprWorkGroup0], s[sgpr105], s[sgprNumWorkGroups0] // remainder part 1 : quotient * divisor -s_sub_u32 s[sgprWorkGroup0], s[sgpr104], s[sgprWorkGroup0] // wg0 = Tile Idx % problemNumGroupTiles0 +s_mul_i32 s99, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] // Total tiles +v_cvt_f32_u32 v4, s99 // TileID // nWG0*nWG1 +v_rcp_iflag_f32 v4, v4 // TileID // nWG0*nWG1 +v_cvt_f32_u32 v5, s98 // TileID // nWG0*nWG1 +v_mul_f32 v4, v4, v5 // TileID // nWG0*nWG1 +v_cvt_u32_f32 v4, v4 // TileID // nWG0*nWG1 +v_mul_u32_u24 v5, v4, s99 // TileID // nWG0*nWG1 +v_sub_u32 v5, s98, v5 // TileID // nWG0*nWG1 +v_cmpx_eq_u32 exec, v5, s99 // TileID // nWG0*nWG1 +v_add_u32 v4, 1, v4 // TileID // nWG0*nWG1 +v_mov_b32 v5, 0 // TileID // nWG0*nWG1 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s99 // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s99 // re-calculate remainder +v_sub_u32 v5, s98, v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup2], v4 // quotient +v_readfirstlane_b32 s100, v5 // remainder +v_cvt_f32_u32 v4, s[sgprNumWorkGroups0] // TileID // nWG0 +v_rcp_iflag_f32 v4, v4 // TileID // nWG0 +v_cvt_f32_u32 v5, s100 // TileID // nWG0 +v_mul_f32 v4, v4, v5 // TileID // nWG0 +v_cvt_u32_f32 v4, v4 // TileID // nWG0 +v_mul_u32_u24 v5, v4, s[sgprNumWorkGroups0] // TileID // nWG0 +v_sub_u32 v5, s100, v5 // TileID // nWG0 +v_cmpx_eq_u32 exec, v5, s[sgprNumWorkGroups0] // TileID // nWG0 +v_add_u32 v4, 1, v4 // TileID // nWG0 +v_mov_b32 v5, 0 // TileID // nWG0 +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v5, s[sgprNumWorkGroups0] // overflow happened in remainder +v_sub_u32 v4, v4, 1 // quotient - 1 +v_mul_u32_u24 v5, v4, s[sgprNumWorkGroups0] // re-calculate remainder +v_sub_u32 v5, s100, v5 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s[sgprWorkGroup1], v4 // quotient +v_readfirstlane_b32 s[sgprWorkGroup0], v5 // remainder v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0 // s[Alpha] == 0.0f ? s_cbranch_vccz label_SKAlphaCheck // branch if s[Alpha] != 0 s_cmp_eq_u32 s[sgprStreamKLocalStart], 0 // does wg start tile? s_cbranch_scc1 label_NoBranch_UR8VN3A1SJCPC6PO // Only branch on scc0 -s_getpc_b64 s[sgpr108:sgpr109] // addr of next instr -s_add_i32 s[sgpr110], label_GW_End, 4 // target branch offset -s_add_u32 s[sgpr108], s[sgpr108], s[sgpr110] // add target branch offset -s_addc_u32 s[sgpr109], s[sgpr109], 0 // add high and carry -s_setpc_b64 s[sgpr108:sgpr109] // branch to label_GW_End +s_getpc_b64 s[sgpr102:sgpr103] // addr of next instr +s_add_i32 s[sgpr104], label_GW_End, 4 // target branch offset +s_add_u32 s[sgpr102], s[sgpr102], s[sgpr104] // add target branch offset +s_addc_u32 s[sgpr103], s[sgpr103], 0 // add high and carry +s_setpc_b64 s[sgpr102:sgpr103] // branch to label_GW_End label_NoBranch_UR8VN3A1SJCPC6PO: s_mov_b32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Skip iterations label_SKAlphaCheck: @@ -1008,130 +982,130 @@ s_cmp_gt_i32 s[sgprWGM], 1 // WGM > 1 ? s_cbranch_scc1 label_WGMPositive // branch if WGM > 1 s_cmp_ge_i32 s[sgprWGM], 0 // WGM >= 0 ? s_cbranch_scc1 label_WGM // branch if WGM >= 0 -s_abs_i32 s[sgpr108], s[sgprWGM] // abs(WGM) -v_cvt_f32_u32 v4, s[sgpr108] // WGM +s_abs_i32 s101, s[sgprWGM] // abs(WGM) +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprWorkGroup0] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprWorkGroup0], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr106], v4 // quotient -s_mul_i32 s[sgpr107], s[sgpr106], s[sgpr108] // quotient * non-magic divisor -s_sub_u32 s[sgpr107], s[sgprWorkGroup0], s[sgpr107] // WorkGroup0=remainder -s_mul_i32 s[sgpr107], s[sgpr107], s[sgprNumWorkGroups1] // (wg1 % WGM)*NumWorkGroups1 -s_add_u32 s[sgpr107], s[sgpr107], s[sgprWorkGroup1] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups1 -v_cvt_f32_u32 v4, s[sgpr108] // WGM +v_readfirstlane_b32 s97, v4 // quotient +s_mul_i32 s100, s97, s101 // quotient * non-magic divisor +s_sub_u32 s100, s[sgprWorkGroup0], s100 // WorkGroup0=remainder +s_mul_i32 s100, s100, s[sgprNumWorkGroups1] // (wg1 % WGM)*NumWorkGroups1 +s_add_u32 s100, s100, s[sgprWorkGroup1] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups1 +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprNumWorkGroups0] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprNumWorkGroups0], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr104], v4 // quotient -s_mul_i32 s[sgpr105], s[sgpr108], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgpr105], s[sgprNumWorkGroups0], s[sgpr105] // NumWorkGroups0=remainder -s_cmp_eq_u32 s[sgpr105], 0 // remainder == 0 ? -s_cmov_b32 s[sgpr105], s[sgpr108] // remainder = WGM if remainder == 0 -s_cmp_ge_u32 s[sgpr106], s[sgpr104] // blockId >= numFullBlocks ? -s_cselect_b32 s[sgpr104], s[sgpr105], s[sgpr108] -v_cvt_f32_u32 v4, s[sgpr104] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_cvt_f32_u32 v5, s[sgpr107] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_mul_f32 v4, v4, v5 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_mul_u32_u24 v5, v4, s[sgpr104] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_sub_u32 v5, s[sgpr107], v5 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_cmpx_eq_u32 exec, v5, s[sgpr104] // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_add_u32 v4, 1, v4 // s[sgprWorkGroup1] = s[sgpr107] / s[sgpr104] -v_mov_b32 v5, 0 // s[sgprWorkGroup0] = s[sgpr107] % s[sgpr104] +v_readfirstlane_b32 s98, v4 // quotient +s_mul_i32 s99, s101, s98 // quotient * non-magic divisor +s_sub_u32 s99, s[sgprNumWorkGroups0], s99 // NumWorkGroups0=remainder +s_cmp_eq_u32 s99, 0 // remainder == 0 ? +s_cmov_b32 s99, s101 // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s97, s98 // blockId >= numFullBlocks ? +s_cselect_b32 s98, s99, s101 +v_cvt_f32_u32 v4, s98 // s[sgprWorkGroup1] = s100 / s98 +v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup1] = s100 / s98 +v_cvt_f32_u32 v5, s100 // s[sgprWorkGroup1] = s100 / s98 +v_mul_f32 v4, v4, v5 // s[sgprWorkGroup1] = s100 / s98 +v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup1] = s100 / s98 +v_mul_u32_u24 v5, v4, s98 // s[sgprWorkGroup1] = s100 / s98 +v_sub_u32 v5, s100, v5 // s[sgprWorkGroup1] = s100 / s98 +v_cmpx_eq_u32 exec, v5, s98 // s[sgprWorkGroup1] = s100 / s98 +v_add_u32 v4, 1, v4 // s[sgprWorkGroup1] = s100 / s98 +v_mov_b32 v5, 0 // s[sgprWorkGroup0] = s100 % s98 s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr104] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s98 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 -v_mul_u32_u24 v5, v4, s[sgpr104] // re-calculate remainder -v_sub_u32 v5, s[sgpr107], v5 // re-calculate remainder +v_mul_u32_u24 v5, v4, s98 // re-calculate remainder +v_sub_u32 v5, s100, v5 // re-calculate remainder s_mov_b64 exec, -1 // Reset exec v_readfirstlane_b32 s[sgprWorkGroup1], v4 // quotient v_readfirstlane_b32 s[sgprWorkGroup0], v5 // remainder -s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgprWorkGroup0], s[sgpr107], s[sgprWorkGroup0] // WorkGroup0=remainder -s_mul_i32 s[sgpr106], s[sgpr106], s[sgpr108] // blockId * WGM -s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s[sgpr106] // wg1 += blockId * WGM +s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s98 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup0], s100, s[sgprWorkGroup0] // WorkGroup0=remainder +s_mul_i32 s97, s97, s101 // blockId * WGM +s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s97 // wg1 += blockId * WGM s_branch label_WGM label_WGMPositive: -s_mov_b32 s[sgpr108], s[sgprWGM] // WGM -v_cvt_f32_u32 v4, s[sgpr108] // WGM +s_mov_b32 s101, s[sgprWGM] // WGM +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprWorkGroup1] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprWorkGroup1], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr106], v4 // quotient -s_mul_i32 s[sgpr107], s[sgpr106], s[sgpr108] // quotient * non-magic divisor -s_sub_u32 s[sgpr107], s[sgprWorkGroup1], s[sgpr107] // WorkGroup1=remainder -s_mul_i32 s[sgpr107], s[sgpr107], s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 -s_add_u32 s[sgpr107], s[sgpr107], s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 -v_cvt_f32_u32 v4, s[sgpr108] // WGM +v_readfirstlane_b32 s97, v4 // quotient +s_mul_i32 s100, s97, s101 // quotient * non-magic divisor +s_sub_u32 s100, s[sgprWorkGroup1], s100 // WorkGroup1=remainder +s_mul_i32 s100, s100, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s100, s100, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v4, s101 // WGM v_rcp_iflag_f32 v4, v4 // WGM v_cvt_f32_u32 v5, s[sgprNumWorkGroups1] // WGM v_mul_f32 v4, v4, v5 // WGM v_cvt_u32_f32 v4, v4 // WGM -v_mul_u32_u24 v5, v4, s[sgpr108] // WGM +v_mul_u32_u24 v5, v4, s101 // WGM v_sub_u32 v5, s[sgprNumWorkGroups1], v5 // WGM -v_cmpx_eq_u32 exec, v5, s[sgpr108] // WGM +v_cmpx_eq_u32 exec, v5, s101 // WGM v_add_u32 v4, 1, v4 // WGM s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr108] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s101 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 s_mov_b64 exec, -1 // Reset exec -v_readfirstlane_b32 s[sgpr104], v4 // quotient -s_mul_i32 s[sgpr105], s[sgpr108], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgpr105], s[sgprNumWorkGroups1], s[sgpr105] // NumWorkGroups1=remainder -s_cmp_eq_u32 s[sgpr105], 0 // remainder == 0 ? -s_cmov_b32 s[sgpr105], s[sgpr108] // remainder = WGM if remainder == 0 -s_cmp_ge_u32 s[sgpr106], s[sgpr104] // blockId >= numFullBlocks ? -s_cselect_b32 s[sgpr104], s[sgpr105], s[sgpr108] -v_cvt_f32_u32 v4, s[sgpr104] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_cvt_f32_u32 v5, s[sgpr107] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_mul_f32 v4, v4, v5 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_mul_u32_u24 v5, v4, s[sgpr104] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_sub_u32 v5, s[sgpr107], v5 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_cmpx_eq_u32 exec, v5, s[sgpr104] // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_add_u32 v4, 1, v4 // s[sgprWorkGroup0] = s[sgpr107] / s[sgpr104] -v_mov_b32 v5, 0 // s[sgprWorkGroup1] = s[sgpr107] % s[sgpr104] +v_readfirstlane_b32 s98, v4 // quotient +s_mul_i32 s99, s101, s98 // quotient * non-magic divisor +s_sub_u32 s99, s[sgprNumWorkGroups1], s99 // NumWorkGroups1=remainder +s_cmp_eq_u32 s99, 0 // remainder == 0 ? +s_cmov_b32 s99, s101 // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s97, s98 // blockId >= numFullBlocks ? +s_cselect_b32 s98, s99, s101 +v_cvt_f32_u32 v4, s98 // s[sgprWorkGroup0] = s100 / s98 +v_rcp_iflag_f32 v4, v4 // s[sgprWorkGroup0] = s100 / s98 +v_cvt_f32_u32 v5, s100 // s[sgprWorkGroup0] = s100 / s98 +v_mul_f32 v4, v4, v5 // s[sgprWorkGroup0] = s100 / s98 +v_cvt_u32_f32 v4, v4 // s[sgprWorkGroup0] = s100 / s98 +v_mul_u32_u24 v5, v4, s98 // s[sgprWorkGroup0] = s100 / s98 +v_sub_u32 v5, s100, v5 // s[sgprWorkGroup0] = s100 / s98 +v_cmpx_eq_u32 exec, v5, s98 // s[sgprWorkGroup0] = s100 / s98 +v_add_u32 v4, 1, v4 // s[sgprWorkGroup0] = s100 / s98 +v_mov_b32 v5, 0 // s[sgprWorkGroup1] = s100 % s98 s_mov_b64 exec, -1 // Reset exec -v_cmpx_gt_u32 exec, v5, s[sgpr104] // overflow happened in remainder +v_cmpx_gt_u32 exec, v5, s98 // overflow happened in remainder v_sub_u32 v4, v4, 1 // quotient - 1 -v_mul_u32_u24 v5, v4, s[sgpr104] // re-calculate remainder -v_sub_u32 v5, s[sgpr107], v5 // re-calculate remainder +v_mul_u32_u24 v5, v4, s98 // re-calculate remainder +v_sub_u32 v5, s100, v5 // re-calculate remainder s_mov_b64 exec, -1 // Reset exec v_readfirstlane_b32 s[sgprWorkGroup0], v4 // quotient v_readfirstlane_b32 s[sgprWorkGroup1], v5 // remainder -s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s[sgpr104] // quotient * non-magic divisor -s_sub_u32 s[sgprWorkGroup1], s[sgpr107], s[sgprWorkGroup1] // WorkGroup1=remainder -s_mul_i32 s[sgpr106], s[sgpr106], s[sgpr108] // blockId * WGM -s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s[sgpr106] // wg1 += blockId * WGM +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s98 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s100, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s97, s97, s101 // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s97 // wg1 += blockId * WGM label_WGM: /******************************************/ @@ -1165,8 +1139,8 @@ v_lshl_add_u32 v5, v7, 13, v5 // 7. wave offset in M dimen: /* local read addresses: final offsets a */ v_lshrrev_b32 v6, 6, v[vgprSerial] // 6 = Serial / 64 v_lshrrev_b32 v6, 2, v6 // LSU offset: Get LSU wave_id -s_mov_b32 s[sgpr104], 64 // LSU offset: stride = lsuStride(64) when umlds==True -v_mul_lo_u32 v6, s[sgpr104], v6 // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD) +s_mov_b32 s97, 64 // LSU offset: stride = lsuStride(64) when umlds==True +v_mul_lo_u32 v6, s97, v6 // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD) v_add_lshl_u32 v[vgprLocalReadAddrA], v6, v4, 0x1 // Final Offset: offset = (lro0+lsuoffset)*bpeDS v_lshrrev_b32 v7, 10, v[vgprLocalReadAddrA] // Final Offset: padding 16 per block 1024 v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 4, v[vgprLocalReadAddrA] // Final Offset: padding 16 per block 1024 @@ -1175,7 +1149,7 @@ v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 4, v[vgprLocalReadAddrA] // Final Offs v_lshrrev_b32 v4, 6, v[vgprSerial] // 4 = Serial / 64 v_lshrrev_b32 v4, 2, v4 // LSU offset: Get LSU wave_id // LSU offset: stride = lsuStride(64) when umlds==True (dup assign opt.) -v_mul_lo_u32 v4, s[sgpr104], v4 // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD) +v_mul_lo_u32 v4, s97, v4 // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD) v_add_lshl_u32 v[vgprLocalReadAddrB], v4, v5, 0x1 // Final Offset: offset = (lro1+lsuoffset)*bpeDS v_lshrrev_b32 v6, 10, v[vgprLocalReadAddrB] // Final Offset: padding 16 per block 1024 v_lshl_add_u32 v[vgprLocalReadAddrB], v6, 4, v[vgprLocalReadAddrB] // Final Offset: padding 16 per block 1024 @@ -1290,108 +1264,80 @@ s_lshl_b32 s[sgprScalarGlobalReadOffsetB+5], s[sgprScalarGlobalReadOffsetB+5], 0 s_mul_i32 s[sgprScalarGlobalReadOffsetB+6], s[sgprStrideB1J], 224 // compute offset diff (scaled tileDim) s_lshl_b32 s[sgprScalarGlobalReadOffsetB+6], s[sgprScalarGlobalReadOffsetB+6], 0x1 // scalar offset *= bytes/element -// Use sgprScalarGlobalReadOffsetA sgprs -.set sgpr104, sgprSKItersPerWG // skitersperwg, overwrite, 54 -.set sgpr105, sgprskGrid // skgrid, overwrite, 55 -.set sgpr106, sgprMagicNumberProblemNumGroupTiles0 // sgprMagicNumberProblemNumGroupTiles0, 46 -.set sgpr107, sgprMagicShiftProblemNumGroupTiles0 // sgprMagicShiftProblemNumGroupTiles0, 47 -.set sgpr108, sgprMagicShiftItersPerTile // sgprMagicShiftItersPerTile, 50 -.set sgpr109, sgprMagicNumProblemNumGroupTiles0By1 // sgprMagicNumProblemNumGroupTiles0By1, 51 -.set sgpr110, sgprWGM // wgm, 7 - -// Save sgpr values to vgpr -v_writelane_b32 v255, s[sgprSKItersPerWG], 0 -s_nop 0 -v_writelane_b32 v255, s[sgprskGrid], 1 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicNumberProblemNumGroupTiles0], 2 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicShiftProblemNumGroupTiles0], 3 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicShiftItersPerTile], 4 -s_nop 0 -v_writelane_b32 v255, s[sgprMagicNumProblemNumGroupTiles0By1], 5 -s_nop 0 -v_writelane_b32 v255, s[sgprWGM], 6 -s_nop 0 -v_writelane_b32 v255, s[sgprKernArgAddress], 7 -s_nop 0 -v_writelane_b32 v255, s[sgprKernArgAddress+1], 8 - /* global read addresses: addresses a */ /* max read offset = size[n] * stride[n-1] */ -s_mul_hi_u32 s[sgpr107], s[sgprWorkGroup0], 256 // WorkGroup[01] * MT -s_mul_i32 s[sgpr106], s[sgprWorkGroup0], 256 // WorkGroup[01] * MT -s_mul_hi_u32 s[sgpr107], s[sgpr106], s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr106], s[sgpr106], s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr104], s[sgprStreamKLocalStart], DepthU // StreamK tile start offset -s_mul_hi_u32 s[sgpr105], s[sgpr104], constStrideAL // StreamK tile start offset -s_mul_i32 s[sgpr104], s[sgpr104], constStrideAL // StreamK tile start offset -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum GsuOffset term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum GsuOffset term to tilestart +s_mul_hi_u32 s101, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_i32 s100, s[sgprWorkGroup0], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s101, s100, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_mul_i32 s100, s100, s[sgprStrideA0I] // tlu=0, scaled tile-offset by stride +s_mul_i32 s98, s[sgprStreamKLocalStart], DepthU // StreamK tile start offset +s_mul_hi_u32 s99, s98, constStrideAL // StreamK tile start offset +s_mul_i32 s98, s98, constStrideAL // StreamK tile start offset +s_add_u32 s100, s100, s98 // accum GsuOffset term to tilestart +s_addc_u32 s101, s101, s99 // accum GsuOffset term to tilestart s_mov_b64 s[sgprShadowLimitA+0:sgprShadowLimitA+0+1], 1 // Init tensor size -s_sub_u32 s[sgpr104], s[sgprSizeL], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], constStrideAL, s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], constStrideAL, s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgpr104], s[sgprSizeI], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], s[sgprStrideA0I], s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], s[sgprStrideA0I], s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr106] // sub tileStart -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr107] // sub tileStart +s_sub_u32 s98, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s99, constStrideAL, s98 // stride x (size-1) +s_mul_i32 s98, constStrideAL, s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // sum tensor size +s_sub_u32 s98, s[sgprSizeI], 1 // (size-1) +s_mul_hi_u32 s99, s[sgprStrideA0I], s98 // stride x (size-1) +s_mul_i32 s98, s[sgprStrideA0I], s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // sum tensor size +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s100 // sub tileStart +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s101 // sub tileStart s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], 0x1 // Set limit to use bytes s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 16 // extend limit for pre-pad s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 -s_mul_hi_u32 s[sgpr105], s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG -s_mul_i32 s[sgpr104], s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum wg term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum wg term to tilestart -s_lshl_b64 s[sgpr106:sgpr107], s[sgpr106:sgpr107], 1 // tileStart *= BPE -s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgpr106] // SRD base = Address+ tileStart0 -s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgpr107] // SRD base = Address+ tileStart1 +s_mul_hi_u32 s99, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s98, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s100, s100, s98 // accum wg term to tilestart +s_addc_u32 s101, s101, s99 // accum wg term to tilestart +s_lshl_b64 s[100:101], s[100:101], 1 // tileStart *= BPE +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s100 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s101 // SRD base = Address+ tileStart1 s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD /* global read addresses: addresses b */ /* max read offset = size[n] * stride[n-1] */ -s_mul_hi_u32 s[sgpr107], s[sgprWorkGroup1], 256 // WorkGroup[01] * MT -s_mul_i32 s[sgpr106], s[sgprWorkGroup1], 256 // WorkGroup[01] * MT -s_mul_hi_u32 s[sgpr107], s[sgpr106], s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr106], s[sgpr106], s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride -s_mul_i32 s[sgpr104], s[sgprStreamKLocalStart], DepthU // StreamK tile start offset -s_mul_hi_u32 s[sgpr105], s[sgpr104], constStrideBL // StreamK tile start offset -s_mul_i32 s[sgpr104], s[sgpr104], constStrideBL // StreamK tile start offset -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum GsuOffset term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum GsuOffset term to tilestart +s_mul_hi_u32 s101, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_i32 s100, s[sgprWorkGroup1], 256 // WorkGroup[01] * MT +s_mul_hi_u32 s101, s100, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_mul_i32 s100, s100, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride +s_mul_i32 s98, s[sgprStreamKLocalStart], DepthU // StreamK tile start offset +s_mul_hi_u32 s99, s98, constStrideBL // StreamK tile start offset +s_mul_i32 s98, s98, constStrideBL // StreamK tile start offset +s_add_u32 s100, s100, s98 // accum GsuOffset term to tilestart +s_addc_u32 s101, s101, s99 // accum GsuOffset term to tilestart s_mov_b64 s[sgprShadowLimitB+0:sgprShadowLimitB+0+1], 1 // Init tensor size -s_sub_u32 s[sgpr104], s[sgprSizeL], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], constStrideBL, s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], constStrideBL, s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgpr104], s[sgprSizeJ], 1 // (size-1) -s_mul_hi_u32 s[sgpr105], s[sgprStrideB1J], s[sgpr104] // stride x (size-1) -s_mul_i32 s[sgpr104], s[sgprStrideB1J], s[sgpr104] // stride x (size-1) -s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // sum tensor size -s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // sum tensor size -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr106] // sub tileStart -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr107] // sub tileStart +s_sub_u32 s98, s[sgprSizeL], 1 // (size-1) +s_mul_hi_u32 s99, constStrideBL, s98 // stride x (size-1) +s_mul_i32 s98, constStrideBL, s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // sum tensor size +s_sub_u32 s98, s[sgprSizeJ], 1 // (size-1) +s_mul_hi_u32 s99, s[sgprStrideB1J], s98 // stride x (size-1) +s_mul_i32 s98, s[sgprStrideB1J], s98 // stride x (size-1) +s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // sum tensor size +s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // sum tensor size +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s100 // sub tileStart +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s101 // sub tileStart s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], 0x1 // Set limit to use bytes s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], 16 // extend limit for pre-pad s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], 0 // extend limit for pre-pad s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 -s_mul_hi_u32 s[sgpr105], s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG -s_mul_i32 s[sgpr104], s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG -s_add_u32 s[sgpr106], s[sgpr106], s[sgpr104] // accum wg term to tilestart -s_addc_u32 s[sgpr107], s[sgpr107], s[sgpr105] // accum wg term to tilestart -s_lshl_b64 s[sgpr106:sgpr107], s[sgpr106:sgpr107], 1 // tileStart *= BPE -s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgpr106] // SRD base = Address+ tileStart0 -s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgpr107] // SRD base = Address+ tileStart1 +s_mul_hi_u32 s99, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s98, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s100, s100, s98 // accum wg term to tilestart +s_addc_u32 s101, s101, s99 // accum wg term to tilestart +s_lshl_b64 s[100:101], s[100:101], 1 // tileStart *= BPE +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s100 // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s101 // SRD base = Address+ tileStart1 s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD /* global read addresses: increments a */ @@ -1405,87 +1351,87 @@ v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0 // s[Alpha] == 0.0f ? s_cbranch_vccz label_SKAlphaCheck2 // branch if s[Alpha] != 0 s_mov_b32 s[sgprLoopCounterL], 0 // Skip iterations label_SKAlphaCheck2: -s_and_b32 s[sgpr105], 63, s[sgprSizesSum+0] // s[sgpr105] = s[sgprSizesSum+0] % 64 -s_cmp_eq_u32 s[sgpr105], 0 // numIterL == 0 -s_cselect_b32 s[sgpr104], 0, 1 // check if size uses tail loop +s_and_b32 s99, 63, s[sgprSizesSum+0] // s99 = s[sgprSizesSum+0] % 64 +s_cmp_eq_u32 s99, 0 // numIterL == 0 +s_cselect_b32 s98, 0, 1 // check if size uses tail loop s_cmp_eq_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Check if WG processes final iteration of tile -s_cselect_b32 s[sgpr104], s[sgpr104], 0 // this WG runs tail loop -s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], s[sgpr104] // Adjust loop counter for tail loop +s_cselect_b32 s98, s98, 0 // this WG runs tail loop +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], s98 // Adjust loop counter for tail loop s_mov_b32 s[sgprOrigLoopCounter], s[sgprLoopCounterL] // copy loop counter -s_and_b32 s[sgpr106], s[sgprStaggerU], 0x1f00 -s_lshr_b32 s[sgpr106], s[sgpr106], 0x8 -s_and_b32 s[sgpr107], s[sgprStaggerU], 0xe000 +s_and_b32 s100, s[sgprStaggerU], 0x1f00 +s_lshr_b32 s100, s100, 0x8 +s_and_b32 s101, s[sgprStaggerU], 0xe000 s_and_b32 s[sgprStaggerU], s[sgprStaggerU], 0xff -s_mov_b32 s[sgpr104], s[sgprStaggerU] // init staggerU +s_mov_b32 s98, s[sgprStaggerU] // init staggerU label_beginStaggerUIter: -s_lshl_b32 s[sgpr105], s[sgpr104], s[sgpr106] // shift by StaggerUStride -s_cmp_ge_u32 s[sgprOrigLoopCounter], s[sgpr105] // loopCount >= current shift Count +s_lshl_b32 s99, s98, s100 // shift by StaggerUStride +s_cmp_ge_u32 s[sgprOrigLoopCounter], s99 // loopCount >= current shift Count s_cbranch_scc1 label_endStaggerUIter // jump to end -s_lshr_b32 s[sgpr104], s[sgpr104], 1 // step down to smaller stagger +s_lshr_b32 s98, s98, 1 // step down to smaller stagger s_branch label_beginStaggerUIter // jump to begin label_endStaggerUIter: -s_sub_u32 s[sgpr105], s[sgpr104], 1 // staggerU mask -s_cmp_ge_u32 s[sgpr104], 1 // if current staggerU >= 1 -s_cselect_b32 s[sgprStaggerUIter], s[sgpr105], 0 // set Mask -s_cmp_eq_u32 s[sgpr107], 0x0 +s_sub_u32 s99, s98, 1 // staggerU mask +s_cmp_ge_u32 s98, 1 // if current staggerU >= 1 +s_cselect_b32 s[sgprStaggerUIter], s99, 0 // set Mask +s_cmp_eq_u32 s101, 0x0 s_cbranch_scc1 label_StaggerUMapping_1 -s_mov_b32 s[sgpr104], s[sgprWorkGroup0] +s_mov_b32 s98, s[sgprWorkGroup0] s_branch label_staggerInputEnd label_StaggerUMapping_1: -s_cmp_eq_u32 s[sgpr107], 0x2000 +s_cmp_eq_u32 s101, 0x2000 s_cbranch_scc1 label_StaggerUMapping_2 -s_mov_b32 s[sgpr104], s[sgprWorkGroup1] +s_mov_b32 s98, s[sgprWorkGroup1] s_branch label_staggerInputEnd label_StaggerUMapping_2: -s_cmp_eq_u32 s[sgpr107], 0x4000 +s_cmp_eq_u32 s101, 0x4000 s_cbranch_scc1 label_StaggerUMapping_3 -s_mov_b32 s[sgpr104], -0x1 +s_mov_b32 s98, -0x1 s_branch label_staggerInputEnd label_StaggerUMapping_3: -s_cmp_eq_u32 s[sgpr107], 0x6000 +s_cmp_eq_u32 s101, 0x6000 s_cbranch_scc1 label_StaggerUMapping_4 -s_mul_i32 s[sgpr105], s[sgprNumWorkGroups0], s[sgprWorkGroup1] -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr105] -s_add_u32 s[sgpr104], s[sgpr104], s[sgprWorkGroup0] +s_mul_i32 s99, s[sgprNumWorkGroups0], s[sgprWorkGroup1] +s_add_u32 s98, s98, s99 +s_add_u32 s98, s98, s[sgprWorkGroup0] s_branch label_staggerInputEnd label_StaggerUMapping_4: -s_cmp_eq_u32 s[sgpr107], 0x8000 +s_cmp_eq_u32 s101, 0x8000 s_cbranch_scc1 label_staggerInputEnd -s_mov_b32 s[sgpr104], -0x1 +s_mov_b32 s98, -0x1 s_branch label_staggerInputEnd label_staggerInputEnd: -s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s[sgpr104] // Compute actual stagger start for this tile -s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s[sgpr106] // shift by StaggerUStride +s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s98 // Compute actual stagger start for this tile +s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s100 // shift by StaggerUStride s_cmp_gt_u32 s[sgprStreamKLocalStart], 0 // does wg start tile? s_cmov_b32 s[sgprStaggerUIter], 0 // set stagger=0 for partial tiles s_cmp_lt_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // does wg finish tile? s_cmov_b32 s[sgprStaggerUIter], 0 // set stagger=0 for partial tiles /* SRDs += (StaggerUIter) * GlobalReadIncsA+0 */ -s_mul_hi_i32 s[sgpr105], s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset -s_mul_i32 s[sgpr104], s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset +s_mul_hi_i32 s99, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset +s_mul_i32 s98, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset s_mul_hi_i32 s[sgprWrapUA+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop s_mul_i32 s[sgprWrapUA+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop s_sub_u32 s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0], s[sgprWrapUA+0] // remove one iteration s_subb_u32 s[sgprWrapUA+1], 0, s[sgprWrapUA+1] // remove one iteration -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 /* SRDs += (StaggerUIter) * GlobalReadIncsB+0 */ -s_mul_hi_i32 s[sgpr105], s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset -s_mul_i32 s[sgpr104], s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset +s_mul_hi_i32 s99, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset +s_mul_i32 s98, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset s_mul_hi_i32 s[sgprWrapUB+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop s_mul_i32 s[sgprWrapUB+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop s_sub_u32 s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0], s[sgprWrapUB+0] // remove one iteration s_subb_u32 s[sgprWrapUB+1], 0, s[sgprWrapUB+1] // remove one iteration -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 s_add_u32 s[sgprStaggerUIter], s[sgprStaggerUIter], 2 // Subtract (PGR-1); StaggerUIter now contains target iteration to wrap @@ -1535,26 +1481,26 @@ s_add_u32 m0, m0, 4160 // Move LDS write address to buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0 /* global read inc A loopL */ -s_add_u32 s[sgpr106], s[sgprLoopCounterL], 1 // remove pf(1) -s_cmp_eq_u32 s[sgprStaggerUIter], s[sgpr106] // Is this wrapIter? (pf) -s_cselect_b32 s[sgpr104], s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUA+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_add_u32 s100, s[sgprLoopCounterL], 1 // remove pf(1) +s_cmp_eq_u32 s[sgprStaggerUIter], s100 // Is this wrapIter? (pf) +s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUA+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 /* global read inc B loopL */ -s_add_u32 s[sgpr106], s[sgprLoopCounterL], 1 // remove pf(1) -s_cmp_eq_u32 s[sgprStaggerUIter], s[sgpr106] // Is this wrapIter? (pf) -s_cselect_b32 s[sgpr104], s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUB+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_add_u32 s100, s[sgprLoopCounterL], 1 // remove pf(1) +s_cmp_eq_u32 s[sgprStaggerUIter], s100 // Is this wrapIter? (pf) +s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUB+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 @@ -1571,28 +1517,28 @@ s_mov_b32 s[sgprSrdC+2], BufferOOB s_mov_b32 s[sgprSrdC+3], Srd127_96 // Set bits 127_96 in post-loop SRD -s_mul_i32 s[sgpr106], MT1, s[sgprWorkGroup1] // <- wg1*MT1 -s_mul_hi_u32 s[sgpr105], s[sgpr106], s[sgprStrideC1J] // ScaleC s[sgpr106] by Stride -s_mul_i32 s[sgpr104], s[sgpr106], s[sgprStrideC1J] // ScaleC s[sgpr106] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s[sgpr105] // add hi to SRD -s_mul_hi_u32 s[sgpr105], s[sgpr106], s[sgprStrideD1J] // ScaleD s[sgpr106] by Stride -s_mul_i32 s[sgpr104], s[sgpr106], s[sgprStrideD1J] // ScaleD s[sgpr106] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s[sgpr105] // add hi to SRD - -s_mul_hi_u32 s[sgpr105], s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride -s_mul_i32 s[sgpr104], s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s[sgpr105] // add hi to SRD -s_mul_hi_u32 s[sgpr105], s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride -s_mul_i32 s[sgpr104], s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride -s_lshl_b64 s[sgpr104:sgpr105], s[sgpr104:sgpr105], 1 // scale by bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgpr104] // add lo to SRD -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgpr105] // add hi to SRD +s_mul_i32 s100, MT1, s[sgprWorkGroup1] // <- wg1*MT1 +s_mul_hi_u32 s99, s100, s[sgprStrideC1J] // ScaleC s100 by Stride +s_mul_i32 s98, s100, s[sgprStrideC1J] // ScaleC s100 by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s99 // add hi to SRD +s_mul_hi_u32 s99, s100, s[sgprStrideD1J] // ScaleD s100 by Stride +s_mul_i32 s98, s100, s[sgprStrideD1J] // ScaleD s100 by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s99 // add hi to SRD + +s_mul_hi_u32 s99, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride +s_mul_i32 s98, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s99 // add hi to SRD +s_mul_hi_u32 s99, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride +s_mul_i32 s98, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride +s_lshl_b64 s[98:99], s[98:99], 1 // scale by bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s98 // add lo to SRD +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s99 // add hi to SRD // Init C @@ -1635,11 +1581,11 @@ s_cmp_eq_u32 s[sgprLoopCounterL], 0 // at last iteration? /* after InitC, skip to end of prefetch last iter if numIter==0 */ s_cbranch_scc0 label_NoBranch_8S4L1KCK9VFC7AQU // Only branch on scc1 -s_getpc_b64 s[sgpr104:sgpr105] // addr of next instr -s_add_i32 s[sgpr106], label_PrefetchGlobalLastIterEnd, 4 // target branch offset -s_add_u32 s[sgpr104], s[sgpr104], s[sgpr106] // add target branch offset -s_addc_u32 s[sgpr105], s[sgpr105], 0 // add high and carry -s_setpc_b64 s[sgpr104:sgpr105] // branch to label_PrefetchGlobalLastIterEnd +s_getpc_b64 s[98:99] // addr of next instr +s_add_i32 s100, label_PrefetchGlobalLastIterEnd, 4 // target branch offset +s_add_u32 s98, s98, s100 // add target branch offset +s_addc_u32 s99, s99, 0 // add high and carry +s_setpc_b64 s[98:99] // branch to label_PrefetchGlobalLastIterEnd label_NoBranch_8S4L1KCK9VFC7AQU: s_waitcnt vmcnt(0) // wait for global read s_barrier // For stream-k / persistent loop @@ -1655,7 +1601,6 @@ s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap R s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR s_cmp_eq_u32 s[sgprLoopCounterL], 0x1 // PGR=2 but only 1 loop s_cbranch_scc1 label_skipPGR2 // PGR=2 but only 1 loop - s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> Reg 0_0_0_0 s_add_u32 m0, m0, 4160 // Move LDS write address to next line @@ -1721,6 +1666,14 @@ ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] o ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 s_waitcnt lgkmcnt(0) + +/* local read inc a */ +/* N/A, lro->32 */ +/* self.localReadDoCntA 1 self.localReadDoCntB 1 */ + +/* local read inc b */ +/* N/A, lro->32 */ +/* self.localReadDoCntA 1 self.localReadDoCntB 1 */ /******************************************/ /* Unrolled Loop(s) - Begin */ @@ -1730,7 +1683,7 @@ s_cmp_eq_u32 s[sgprLoopCounterL], 0x1 // LoopCounterL < EndCounter s_cbranch_scc1 label_toPGR1 // PGR=2 but only 1 loop, toPGR1 s_cmp_le_u32 s[sgprLoopCounterL], 0x2 // LoopCounterL < EndCounter s_cbranch_scc1 label_LoopEndL // do not enter LoopL - +label_LoopBeginL: // MAIN LOOP MACRO - Shared code between Even/Odd simds .macro MAINLOOP isOdd @@ -1742,27 +1695,27 @@ ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] off v_mfma_f32_16x16x32_f16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] /* global read inc A loopL */ s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s[sgpr104], s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? /* mfmaIndex:2 */ v_mfma_f32_16x16x32_f16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0 /* mfmaIndex:3 */ v_mfma_f32_16x16x32_f16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] -s_cselect_b32 s[sgpr105], s[sgprWrapUA+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) +s_cselect_b32 s99, s[sgprWrapUA+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) /* mfmaIndex:4 */ v_mfma_f32_16x16x32_f16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0 /* mfmaIndex:5 */ v_mfma_f32_16x16x32_f16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) /* mfmaIndex:6 */ v_mfma_f32_16x16x32_f16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0 /* mfmaIndex:7 */ v_mfma_f32_16x16x32_f16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? /* mfmaIndex:8 */ v_mfma_f32_16x16x32_f16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0] @@ -1777,25 +1730,25 @@ v_mfma_f32_16x16x32_f16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+ ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0 /* mfmaIndex:11 */ v_mfma_f32_16x16x32_f16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] -s_cselect_b32 s[sgpr104], s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUB+1], 0 // incUpper <- ? +s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUB+1], 0 // incUpper <- ? /* mfmaIndex:12 */ v_mfma_f32_16x16x32_f16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0 /* mfmaIndex:13 */ v_mfma_f32_16x16x32_f16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) /* mfmaIndex:14 */ v_mfma_f32_16x16x32_f16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 /* mfmaIndex:15 */ v_mfma_f32_16x16x32_f16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] s_mov_b32 m0, s[sgprLocalWriteAddrA] // m0 <- LDS write address -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) /* mfmaIndex:16 */ v_mfma_f32_16x16x32_f16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0] -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? /* mfmaIndex:17 */ v_mfma_f32_16x16x32_f16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0] @@ -2432,8 +2385,8 @@ v_mfma_f32_16x16x32_f16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+2 // EVEN SIMDID takes WVLoop0 path, ODD SIMDID takes other path -s_getreg_b32 s[sgpr104], hwreg(HW_REG_HW_ID, 4, 1) -s_cmp_eq_u32 s[sgpr104], 0 +s_getreg_b32 s98, hwreg(HW_REG_HW_ID, 4, 1) +s_cmp_eq_u32 s98, 0 s_cbranch_scc0 label_LoopBeginL1 /******************************************/ @@ -2455,8 +2408,6 @@ s_cbranch_scc0 label_LoopBeginL1 // restart LoopL label_LoopEndL: -/* Before NLL: Check VGPR.checkin for INT8 LW */ - /******************************************/ /* Ord. NoGlobalLoadLoop - Begin */ /******************************************/ @@ -2473,7 +2424,7 @@ ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] off v_mfma_f32_16x16x32_f16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] /* global read inc A loopL */ s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s[sgpr104], s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? +s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? /* mfmaIndex:2 */ v_mfma_f32_16x16x32_f16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0] @@ -2481,8 +2432,8 @@ ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] off /* mfmaIndex:3 */ v_mfma_f32_16x16x32_f16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0] -s_cselect_b32 s[sgpr105], s[sgprWrapUA+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) +s_cselect_b32 s99, s[sgprWrapUA+1], 0 // incUpper <- ? +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) /* mfmaIndex:4 */ v_mfma_f32_16x16x32_f16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0] @@ -2490,8 +2441,8 @@ ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] off /* mfmaIndex:5 */ v_mfma_f32_16x16x32_f16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0] -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) /* mfmaIndex:6 */ v_mfma_f32_16x16x32_f16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0] @@ -2499,7 +2450,7 @@ ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] off /* mfmaIndex:7 */ v_mfma_f32_16x16x32_f16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0] -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? /* mfmaIndex:8 */ @@ -2518,8 +2469,8 @@ ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] o /* mfmaIndex:11 */ v_mfma_f32_16x16x32_f16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0] -s_cselect_b32 s[sgpr104], s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s[sgpr105], s[sgprWrapUB+1], 0 // incUpper <- ? +s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? +s_cselect_b32 s99, s[sgprWrapUB+1], 0 // incUpper <- ? /* mfmaIndex:12 */ v_mfma_f32_16x16x32_f16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0] @@ -2527,8 +2478,8 @@ ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] o /* mfmaIndex:13 */ v_mfma_f32_16x16x32_f16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0] -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) /* mfmaIndex:14 */ v_mfma_f32_16x16x32_f16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0] @@ -2536,8 +2487,8 @@ ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] o /* mfmaIndex:15 */ v_mfma_f32_16x16x32_f16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0] -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) /* mfmaIndex:16 */ /* localReadsVacancy: latencyLeft 1 */ @@ -3259,12 +3210,12 @@ label_PrefetchGlobalLastIterEnd: /******************************************/ /* local write reset offsets a */ -s_xor_b32 s[sgpr104], s[sgprSwapA], s[sgprLocalWriteAddrA] // Get other lds buffer offset value -s_min_u32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgpr104] // Set LWA to first buffer offset +s_xor_b32 s97, s[sgprSwapA], s[sgprLocalWriteAddrA] // Get other lds buffer offset value +s_min_u32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s97 // Set LWA to first buffer offset /* local write reset offsets b */ -s_xor_b32 s[sgpr104], s[sgprSwapB], s[sgprLocalWriteAddrB] // Get other lds buffer offset value -s_min_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgpr104] // Set LWA to first buffer offset +s_xor_b32 s97, s[sgprSwapB], s[sgprLocalWriteAddrB] // Get other lds buffer offset value +s_min_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s97 // Set LWA to first buffer offset /* Check out VGPR (numG2LA,numG2LB,numG2LMetadata) = (32,32,0) */ .set vgprG2LA_BASE, 4 .set vgprG2LA, vgprG2LA_BASE+0 @@ -3283,56 +3234,56 @@ s_mov_b32 s[sgprOrigLoopCounter], 0 // repurpose to count each lo s_cbranch_scc1 label_SkipTailLoopL // skip to end of tail loop b/c numIter==0 /* remove stagger offsets for tail loop */ -s_sub_i32 s[sgpr104], 3, s[sgprStaggerUIter] -s_cmp_ge_i32 s[sgpr104], 0 +s_sub_i32 s98, 3, s[sgprStaggerUIter] +s_cmp_ge_i32 s98, 0 s_cbranch_scc0 label_Negative_J5DQFVGFWLXU2DUR -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes s_branch label_MultiplyDone_DLSAQLEVYLOBCPNL label_Negative_J5DQFVGFWLXU2DUR: -s_abs_i32 s[sgpr104], s[sgpr104] -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_xor_b32 s[sgpr104], s[sgpr104], 0xffffffff -s_xor_b32 s[sgpr105], s[sgpr105], 0xffffffff -s_add_u32 s[sgpr104], s[sgpr104], 0x1 -s_addc_u32 s[sgpr105], s[sgpr105], 0 +s_abs_i32 s98, s98 +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsA+0] // start offset S in bytes +s_xor_b32 s98, s98, 0xffffffff +s_xor_b32 s99, s99, 0xffffffff +s_add_u32 s98, s98, 0x1 +s_addc_u32 s99, s99, 0 label_MultiplyDone_DLSAQLEVYLOBCPNL: -s_sub_u32 s[sgpr104], s[sgpr104], s[sgprWrapUA] // S - WrapU -s_subb_u32 s[sgpr105], s[sgpr105], s[sgprWrapUA+1] // S - WrapU -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgpr105] // limit -= inc) +s_sub_u32 s98, s98, s[sgprWrapUA] // S - WrapU +s_subb_u32 s99, s99, s[sgprWrapUA+1] // S - WrapU +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 -s_sub_i32 s[sgpr104], 3, s[sgprStaggerUIter] -s_cmp_ge_i32 s[sgpr104], 0 +s_sub_i32 s98, 3, s[sgprStaggerUIter] +s_cmp_ge_i32 s98, 0 s_cbranch_scc0 label_Negative_LQI6BOBE0EY8XIP1 -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes s_branch label_MultiplyDone_9N1QELR2XL4Z0HRB label_Negative_LQI6BOBE0EY8XIP1: -s_abs_i32 s[sgpr104], s[sgpr104] -s_mul_hi_u32 s[sgpr105], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_mul_i32 s[sgpr104], s[sgpr104], s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_xor_b32 s[sgpr104], s[sgpr104], 0xffffffff -s_xor_b32 s[sgpr105], s[sgpr105], 0xffffffff -s_add_u32 s[sgpr104], s[sgpr104], 0x1 -s_addc_u32 s[sgpr105], s[sgpr105], 0 +s_abs_i32 s98, s98 +s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_mul_i32 s98, s98, s[sgprGlobalReadIncsB+0] // start offset S in bytes +s_xor_b32 s98, s98, 0xffffffff +s_xor_b32 s99, s99, 0xffffffff +s_add_u32 s98, s98, 0x1 +s_addc_u32 s99, s99, 0 label_MultiplyDone_9N1QELR2XL4Z0HRB: -s_sub_u32 s[sgpr104], s[sgpr104], s[sgprWrapUB] // S - WrapU -s_subb_u32 s[sgpr105], s[sgpr105], s[sgprWrapUB+1] // S - WrapU -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgpr104] // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgpr105] // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgpr104] // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgpr105] // limit -= inc) +s_sub_u32 s98, s98, s[sgprWrapUB] // S - WrapU +s_subb_u32 s99, s99, s[sgprWrapUB+1] // S - WrapU +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98 // gra SRD += inc(lower) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99 // gra SRD += inc(upper) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc) s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 // Check if K even/odd -s_and_b32 s84, s[sgprSizesSum], 1 -s_cmp_eq_u32 s84, 0 +s_and_b32 s98, s[sgprSizesSum], 1 +s_cmp_eq_u32 s98, 0 s_cbranch_scc0 label_tailloop_non_dtl label_tailloop_dtl: @@ -3711,286 +3662,286 @@ ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] o ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0 /* local read inc a */ -s_mov_b32 s[sgpr104], 0x40 // inc -v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s[sgpr104], v[vgprLocalReadAddrA+0] // lrA += 64 (bpeDS) +s_mov_b32 s97, 0x40 // inc +v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s97, v[vgprLocalReadAddrA+0] // lrA += 64 (bpeDS) /* local read inc b */ // inc (dup assign opt.) -v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s[sgpr104], v[vgprLocalReadAddrB+0] // lrB += 64 (bpeDS) +v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s97, v[vgprLocalReadAddrB+0] // lrB += 64 (bpeDS) s_waitcnt lgkmcnt(0) // 4wait for local read v_and_b32 v135, 63, v[vgprSerial] // v135 = v[vgprSerial] % 64 v_lshrrev_b32 v135, 4, v135 // 135 = 135 / 16 v_lshlrev_b32 v135, 3, v135 // v135 = v135 * 8 v_add_u32 v136, v135, 0 -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+0], v[vgprValuA_X0_I0+4+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+0], v[vgprValuA_X0_I0+12+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+0], v[vgprValuA_X0_I0+20+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+0], v[vgprValuA_X0_I0+28+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+20+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+28+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+0], v[vgprValuA_X0_I0+4+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+0], v[vgprValuA_X0_I0+12+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+0], v[vgprValuA_X0_I0+20+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+0], v[vgprValuA_X0_I0+28+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+20+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+28+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+2], v[vgprValuA_X0_I0+4+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+2], v[vgprValuA_X0_I0+12+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+2], v[vgprValuA_X0_I0+20+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+2], v[vgprValuA_X0_I0+28+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+2], v[vgprValuA_X0_I0+4+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+2], v[vgprValuA_X0_I0+12+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+2], v[vgprValuA_X0_I0+20+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+2], v[vgprValuA_X0_I0+28+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL v_and_b32 v135, 63, v[vgprSerial] // v135 = v[vgprSerial] % 64 v_lshrrev_b32 v135, 4, v135 // 135 = 135 / 16 v_lshlrev_b32 v135, 3, v135 // v135 = v135 * 8 v_add_u32 v136, v135, 0 -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+0], v[vgprValuB_X0_I0+4+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+0], v[vgprValuB_X0_I0+12+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+0], v[vgprValuB_X0_I0+20+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+0], v[vgprValuB_X0_I0+28+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+1], v[vgprValuB_X0_I0+4+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+1], v[vgprValuB_X0_I0+12+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+1], v[vgprValuB_X0_I0+20+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+1], v[vgprValuB_X0_I0+28+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+0], v[vgprValuB_X0_I0+4+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+0], v[vgprValuB_X0_I0+12+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+0], v[vgprValuB_X0_I0+20+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+0], v[vgprValuB_X0_I0+28+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+1], v[vgprValuB_X0_I0+4+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+1], v[vgprValuB_X0_I0+12+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+1], v[vgprValuB_X0_I0+20+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+1], v[vgprValuB_X0_I0+28+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+2], v[vgprValuB_X0_I0+4+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+2], v[vgprValuB_X0_I0+12+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+2], v[vgprValuB_X0_I0+20+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+2], v[vgprValuB_X0_I0+28+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+3], v[vgprValuB_X0_I0+4+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+3], v[vgprValuB_X0_I0+12+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+3], v[vgprValuB_X0_I0+20+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+3], v[vgprValuB_X0_I0+28+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -s_and_b32 s[sgpr106], s[sgprLoopCounterL], 7 // get inputs for edge thread -s_sub_u32 s[sgpr106], 8, s[sgpr106] // use shift to fill 0 for outside element -s_lshl_b32 s[sgpr106], s[sgpr106], 4 // use shift to fill 0 for outside element -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+0+0+0+0:vgprValuA_X0_I0+0+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+0+0+0+2:vgprValuA_X0_I0+0+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+2], v[vgprValuB_X0_I0+4+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+2], v[vgprValuB_X0_I0+12+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+2], v[vgprValuB_X0_I0+20+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+2], v[vgprValuB_X0_I0+28+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+3], v[vgprValuB_X0_I0+4+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+3], v[vgprValuB_X0_I0+12+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+3], v[vgprValuB_X0_I0+20+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+3], v[vgprValuB_X0_I0+28+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +s_and_b32 s97, s[sgprLoopCounterL], 7 // get inputs for edge thread +s_sub_u32 s97, 8, s97 // use shift to fill 0 for outside element +s_lshl_b32 s97, s97, 4 // use shift to fill 0 for outside element +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+0+0+0+0:vgprValuA_X0_I0+0+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+0+0+0+2:vgprValuA_X0_I0+0+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+4+0+0+0:vgprValuA_X0_I0+4+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+4+0+0+2:vgprValuA_X0_I0+4+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+4+0+0+0:vgprValuA_X0_I0+4+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+4+0+0+2:vgprValuA_X0_I0+4+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+0], v[vgprValuA_X0_I0+4+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+0], v[vgprValuA_X0_I0+4+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+2], v[vgprValuA_X0_I0+4+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+8+0+0+0:vgprValuA_X0_I0+8+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+8+0+0+2:vgprValuA_X0_I0+8+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+2], v[vgprValuA_X0_I0+4+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+8+0+0+0:vgprValuA_X0_I0+8+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+8+0+0+2:vgprValuA_X0_I0+8+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+12+0+0+0:vgprValuA_X0_I0+12+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+12+0+0+2:vgprValuA_X0_I0+12+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+12+0+0+0:vgprValuA_X0_I0+12+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+12+0+0+2:vgprValuA_X0_I0+12+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+0], v[vgprValuA_X0_I0+12+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+0], v[vgprValuA_X0_I0+12+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+2], v[vgprValuA_X0_I0+12+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+16+0+0+0:vgprValuA_X0_I0+16+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+16+0+0+2:vgprValuA_X0_I0+16+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+2], v[vgprValuA_X0_I0+12+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+16+0+0+0:vgprValuA_X0_I0+16+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+16+0+0+2:vgprValuA_X0_I0+16+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+20+0+0+0:vgprValuA_X0_I0+20+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+20+0+0+2:vgprValuA_X0_I0+20+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+20+0+0+0:vgprValuA_X0_I0+20+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+20+0+0+2:vgprValuA_X0_I0+20+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+0], v[vgprValuA_X0_I0+20+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+20+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+0], v[vgprValuA_X0_I0+20+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+20+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+2], v[vgprValuA_X0_I0+20+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+24+0+0+0:vgprValuA_X0_I0+24+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+24+0+0+2:vgprValuA_X0_I0+24+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+2], v[vgprValuA_X0_I0+20+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+24+0+0+0:vgprValuA_X0_I0+24+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+24+0+0+2:vgprValuA_X0_I0+24+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X0_I0+28+0+0+0:vgprValuA_X0_I0+28+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X0_I0+28+0+0+2:vgprValuA_X0_I0+28+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+28+0+0+0:vgprValuA_X0_I0+28+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+28+0+0+2:vgprValuA_X0_I0+28+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+0], v[vgprValuA_X0_I0+28+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+28+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+0], v[vgprValuA_X0_I0+28+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+28+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+2], v[vgprValuA_X0_I0+28+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+0+0+0+0:vgprValuB_X0_I0+0+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+0+0+0+2:vgprValuB_X0_I0+0+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+2], v[vgprValuA_X0_I0+28+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+0+0+0+0:vgprValuB_X0_I0+0+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+0+0+0+2:vgprValuB_X0_I0+0+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+4+0+0+0:vgprValuB_X0_I0+4+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+4+0+0+2:vgprValuB_X0_I0+4+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+4+0+0+0:vgprValuB_X0_I0+4+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+4+0+0+2:vgprValuB_X0_I0+4+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+0], v[vgprValuB_X0_I0+4+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+1], v[vgprValuB_X0_I0+4+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+0], v[vgprValuB_X0_I0+4+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+1], v[vgprValuB_X0_I0+4+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+2], v[vgprValuB_X0_I0+4+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+3], v[vgprValuB_X0_I0+4+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+8+0+0+0:vgprValuB_X0_I0+8+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+8+0+0+2:vgprValuB_X0_I0+8+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+2], v[vgprValuB_X0_I0+4+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+3], v[vgprValuB_X0_I0+4+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+8+0+0+0:vgprValuB_X0_I0+8+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+8+0+0+2:vgprValuB_X0_I0+8+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+12+0+0+0:vgprValuB_X0_I0+12+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+12+0+0+2:vgprValuB_X0_I0+12+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+12+0+0+0:vgprValuB_X0_I0+12+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+12+0+0+2:vgprValuB_X0_I0+12+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+0], v[vgprValuB_X0_I0+12+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+1], v[vgprValuB_X0_I0+12+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+0], v[vgprValuB_X0_I0+12+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+1], v[vgprValuB_X0_I0+12+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+2], v[vgprValuB_X0_I0+12+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+3], v[vgprValuB_X0_I0+12+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+16+0+0+0:vgprValuB_X0_I0+16+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+16+0+0+2:vgprValuB_X0_I0+16+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+2], v[vgprValuB_X0_I0+12+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+3], v[vgprValuB_X0_I0+12+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+16+0+0+0:vgprValuB_X0_I0+16+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+16+0+0+2:vgprValuB_X0_I0+16+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+20+0+0+0:vgprValuB_X0_I0+20+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+20+0+0+2:vgprValuB_X0_I0+20+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+20+0+0+0:vgprValuB_X0_I0+20+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+20+0+0+2:vgprValuB_X0_I0+20+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+0], v[vgprValuB_X0_I0+20+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+1], v[vgprValuB_X0_I0+20+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+0], v[vgprValuB_X0_I0+20+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+1], v[vgprValuB_X0_I0+20+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+2], v[vgprValuB_X0_I0+20+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+3], v[vgprValuB_X0_I0+20+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+24+0+0+0:vgprValuB_X0_I0+24+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+24+0+0+2:vgprValuB_X0_I0+24+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+2], v[vgprValuB_X0_I0+20+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+3], v[vgprValuB_X0_I0+20+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+24+0+0+0:vgprValuB_X0_I0+24+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+24+0+0+2:vgprValuB_X0_I0+24+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X0_I0+28+0+0+0:vgprValuB_X0_I0+28+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X0_I0+28+0+0+2:vgprValuB_X0_I0+28+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+28+0+0+0:vgprValuB_X0_I0+28+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+28+0+0+2:vgprValuB_X0_I0+28+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+0], v[vgprValuB_X0_I0+28+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+1], v[vgprValuB_X0_I0+28+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+0], v[vgprValuB_X0_I0+28+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+1], v[vgprValuB_X0_I0+28+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+2], v[vgprValuB_X0_I0+28+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+3], v[vgprValuB_X0_I0+28+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+2], v[vgprValuB_X0_I0+28+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+3], v[vgprValuB_X0_I0+28+0+0+3], v141, s[98:99] s_nop 1 v_mfma_f32_16x16x32_f16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] v_mfma_f32_16x16x32_f16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] @@ -4086,286 +4037,286 @@ ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] o ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0 /* local read inc a */ -s_mov_b32 s[sgpr104], 0x40 // inc -v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s[sgpr104], v[vgprLocalReadAddrA+0] // lrA += 64 (bpeDS) +s_mov_b32 s97, 0x40 // inc +v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s97, v[vgprLocalReadAddrA+0] // lrA += 64 (bpeDS) /* local read inc b */ // inc (dup assign opt.) -v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s[sgpr104], v[vgprLocalReadAddrB+0] // lrB += 64 (bpeDS) +v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s97, v[vgprLocalReadAddrB+0] // lrB += 64 (bpeDS) s_waitcnt lgkmcnt(0) // 4wait for local read v_and_b32 v135, 63, v[vgprSerial] // v135 = v[vgprSerial] % 64 v_lshrrev_b32 v135, 4, v135 // 135 = 135 / 16 v_lshlrev_b32 v135, 3, v135 // v135 = v135 * 8 v_add_u32 v136, v135, 0 -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+0], v[vgprValuA_X1_I0+0+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+0], v[vgprValuA_X1_I0+4+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+0], v[vgprValuA_X1_I0+8+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+0], v[vgprValuA_X1_I0+12+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+0], v[vgprValuA_X1_I0+16+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+0], v[vgprValuA_X1_I0+20+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+0], v[vgprValuA_X1_I0+24+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+0], v[vgprValuA_X1_I0+28+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+1], v[vgprValuA_X1_I0+0+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+1], v[vgprValuA_X1_I0+4+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+1], v[vgprValuA_X1_I0+8+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+1], v[vgprValuA_X1_I0+12+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+1], v[vgprValuA_X1_I0+16+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+1], v[vgprValuA_X1_I0+20+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+1], v[vgprValuA_X1_I0+24+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+1], v[vgprValuA_X1_I0+28+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+0], v[vgprValuA_X1_I0+0+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+0], v[vgprValuA_X1_I0+4+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+0], v[vgprValuA_X1_I0+8+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+0], v[vgprValuA_X1_I0+12+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+0], v[vgprValuA_X1_I0+16+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+0], v[vgprValuA_X1_I0+20+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+0], v[vgprValuA_X1_I0+24+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+0], v[vgprValuA_X1_I0+28+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+1], v[vgprValuA_X1_I0+0+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+1], v[vgprValuA_X1_I0+4+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+1], v[vgprValuA_X1_I0+8+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+1], v[vgprValuA_X1_I0+12+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+1], v[vgprValuA_X1_I0+16+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+1], v[vgprValuA_X1_I0+20+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+1], v[vgprValuA_X1_I0+24+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+1], v[vgprValuA_X1_I0+28+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+2], v[vgprValuA_X1_I0+0+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+2], v[vgprValuA_X1_I0+4+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+2], v[vgprValuA_X1_I0+8+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+2], v[vgprValuA_X1_I0+12+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+2], v[vgprValuA_X1_I0+16+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+2], v[vgprValuA_X1_I0+20+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+2], v[vgprValuA_X1_I0+24+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+2], v[vgprValuA_X1_I0+28+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+2], v[vgprValuA_X1_I0+0+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+2], v[vgprValuA_X1_I0+4+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+2], v[vgprValuA_X1_I0+8+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+2], v[vgprValuA_X1_I0+12+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+2], v[vgprValuA_X1_I0+16+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+2], v[vgprValuA_X1_I0+20+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+2], v[vgprValuA_X1_I0+24+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+2], v[vgprValuA_X1_I0+28+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL v_and_b32 v135, 63, v[vgprSerial] // v135 = v[vgprSerial] % 64 v_lshrrev_b32 v135, 4, v135 // 135 = 135 / 16 v_lshlrev_b32 v135, 3, v135 // v135 = v135 * 8 v_add_u32 v136, v135, 0 -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+0], v[vgprValuB_X1_I0+0+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+0], v[vgprValuB_X1_I0+4+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+0], v[vgprValuB_X1_I0+8+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+0], v[vgprValuB_X1_I0+12+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+0], v[vgprValuB_X1_I0+16+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+0], v[vgprValuB_X1_I0+20+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+0], v[vgprValuB_X1_I0+24+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+0], v[vgprValuB_X1_I0+28+0+0+0], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+1], v[vgprValuB_X1_I0+0+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+1], v[vgprValuB_X1_I0+4+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+1], v[vgprValuB_X1_I0+8+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+1], v[vgprValuB_X1_I0+12+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+1], v[vgprValuB_X1_I0+16+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+1], v[vgprValuB_X1_I0+20+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+1], v[vgprValuB_X1_I0+24+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+1], v[vgprValuB_X1_I0+28+0+0+1], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+0], v[vgprValuB_X1_I0+0+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+0], v[vgprValuB_X1_I0+4+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+0], v[vgprValuB_X1_I0+8+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+0], v[vgprValuB_X1_I0+12+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+0], v[vgprValuB_X1_I0+16+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+0], v[vgprValuB_X1_I0+20+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+0], v[vgprValuB_X1_I0+24+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+0], v[vgprValuB_X1_I0+28+0+0+0], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+1], v[vgprValuB_X1_I0+0+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+1], v[vgprValuB_X1_I0+4+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+1], v[vgprValuB_X1_I0+8+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+1], v[vgprValuB_X1_I0+12+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+1], v[vgprValuB_X1_I0+16+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+1], v[vgprValuB_X1_I0+20+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+1], v[vgprValuB_X1_I0+24+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+1], v[vgprValuB_X1_I0+28+0+0+1], 0, s[98:99] // set 0 if K_idx >= sizeL v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+2], v[vgprValuB_X1_I0+0+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+2], v[vgprValuB_X1_I0+4+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+2], v[vgprValuB_X1_I0+8+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+2], v[vgprValuB_X1_I0+12+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+2], v[vgprValuB_X1_I0+16+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+2], v[vgprValuB_X1_I0+20+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+2], v[vgprValuB_X1_I0+24+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+2], v[vgprValuB_X1_I0+28+0+0+2], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+3], v[vgprValuB_X1_I0+0+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+3], v[vgprValuB_X1_I0+4+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+3], v[vgprValuB_X1_I0+8+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+3], v[vgprValuB_X1_I0+12+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+3], v[vgprValuB_X1_I0+16+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+3], v[vgprValuB_X1_I0+20+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+3], v[vgprValuB_X1_I0+24+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+3], v[vgprValuB_X1_I0+28+0+0+3], 0, s[sgpr104:sgpr105] // set 0 if K_idx >= sizeL -s_and_b32 s[sgpr106], s[sgprLoopCounterL], 7 // get inputs for edge thread -s_sub_u32 s[sgpr106], 8, s[sgpr106] // use shift to fill 0 for outside element -s_lshl_b32 s[sgpr106], s[sgpr106], 4 // use shift to fill 0 for outside element -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+0+0+0+0:vgprValuA_X1_I0+0+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+0+0+0+2:vgprValuA_X1_I0+0+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+2], v[vgprValuB_X1_I0+0+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+2], v[vgprValuB_X1_I0+4+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+2], v[vgprValuB_X1_I0+8+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+2], v[vgprValuB_X1_I0+12+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+2], v[vgprValuB_X1_I0+16+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+2], v[vgprValuB_X1_I0+20+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+2], v[vgprValuB_X1_I0+24+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+2], v[vgprValuB_X1_I0+28+0+0+2], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+3], v[vgprValuB_X1_I0+0+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+3], v[vgprValuB_X1_I0+4+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+3], v[vgprValuB_X1_I0+8+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+3], v[vgprValuB_X1_I0+12+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+3], v[vgprValuB_X1_I0+16+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+3], v[vgprValuB_X1_I0+20+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+3], v[vgprValuB_X1_I0+24+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+3], v[vgprValuB_X1_I0+28+0+0+3], 0, s[98:99] // set 0 if K_idx >= sizeL +s_and_b32 s97, s[sgprLoopCounterL], 7 // get inputs for edge thread +s_sub_u32 s97, 8, s97 // use shift to fill 0 for outside element +s_lshl_b32 s97, s97, 4 // use shift to fill 0 for outside element +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+0+0+0+0:vgprValuA_X1_I0+0+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+0+0+0+2:vgprValuA_X1_I0+0+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+0], v[vgprValuA_X1_I0+0+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+1], v[vgprValuA_X1_I0+0+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+0], v[vgprValuA_X1_I0+0+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+1], v[vgprValuA_X1_I0+0+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+2], v[vgprValuA_X1_I0+0+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+4+0+0+0:vgprValuA_X1_I0+4+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+4+0+0+2:vgprValuA_X1_I0+4+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+2], v[vgprValuA_X1_I0+0+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+4+0+0+0:vgprValuA_X1_I0+4+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+4+0+0+2:vgprValuA_X1_I0+4+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+0], v[vgprValuA_X1_I0+4+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+1], v[vgprValuA_X1_I0+4+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+0], v[vgprValuA_X1_I0+4+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+1], v[vgprValuA_X1_I0+4+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+2], v[vgprValuA_X1_I0+4+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+8+0+0+0:vgprValuA_X1_I0+8+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+8+0+0+2:vgprValuA_X1_I0+8+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+2], v[vgprValuA_X1_I0+4+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+8+0+0+0:vgprValuA_X1_I0+8+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+8+0+0+2:vgprValuA_X1_I0+8+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+0], v[vgprValuA_X1_I0+8+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+1], v[vgprValuA_X1_I0+8+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+0], v[vgprValuA_X1_I0+8+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+1], v[vgprValuA_X1_I0+8+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+2], v[vgprValuA_X1_I0+8+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+12+0+0+0:vgprValuA_X1_I0+12+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+12+0+0+2:vgprValuA_X1_I0+12+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+2], v[vgprValuA_X1_I0+8+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+12+0+0+0:vgprValuA_X1_I0+12+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+12+0+0+2:vgprValuA_X1_I0+12+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+0], v[vgprValuA_X1_I0+12+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+1], v[vgprValuA_X1_I0+12+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+0], v[vgprValuA_X1_I0+12+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+1], v[vgprValuA_X1_I0+12+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+2], v[vgprValuA_X1_I0+12+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+16+0+0+0:vgprValuA_X1_I0+16+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+16+0+0+2:vgprValuA_X1_I0+16+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+2], v[vgprValuA_X1_I0+12+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+16+0+0+0:vgprValuA_X1_I0+16+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+16+0+0+2:vgprValuA_X1_I0+16+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+0], v[vgprValuA_X1_I0+16+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+1], v[vgprValuA_X1_I0+16+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+0], v[vgprValuA_X1_I0+16+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+1], v[vgprValuA_X1_I0+16+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+2], v[vgprValuA_X1_I0+16+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+20+0+0+0:vgprValuA_X1_I0+20+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+20+0+0+2:vgprValuA_X1_I0+20+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+2], v[vgprValuA_X1_I0+16+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+20+0+0+0:vgprValuA_X1_I0+20+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+20+0+0+2:vgprValuA_X1_I0+20+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+0], v[vgprValuA_X1_I0+20+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+1], v[vgprValuA_X1_I0+20+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+0], v[vgprValuA_X1_I0+20+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+1], v[vgprValuA_X1_I0+20+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+2], v[vgprValuA_X1_I0+20+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+24+0+0+0:vgprValuA_X1_I0+24+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+24+0+0+2:vgprValuA_X1_I0+24+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+2], v[vgprValuA_X1_I0+20+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+24+0+0+0:vgprValuA_X1_I0+24+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+24+0+0+2:vgprValuA_X1_I0+24+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+0], v[vgprValuA_X1_I0+24+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+1], v[vgprValuA_X1_I0+24+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+0], v[vgprValuA_X1_I0+24+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+1], v[vgprValuA_X1_I0+24+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+2], v[vgprValuA_X1_I0+24+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuA_X1_I0+28+0+0+0:vgprValuA_X1_I0+28+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuA_X1_I0+28+0+0+2:vgprValuA_X1_I0+28+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+2], v[vgprValuA_X1_I0+24+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X1_I0+28+0+0+0:vgprValuA_X1_I0+28+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X1_I0+28+0+0+2:vgprValuA_X1_I0+28+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+0], v[vgprValuA_X1_I0+28+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+1], v[vgprValuA_X1_I0+28+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+0], v[vgprValuA_X1_I0+28+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+1], v[vgprValuA_X1_I0+28+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+2], v[vgprValuA_X1_I0+28+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+0+0+0+0:vgprValuB_X1_I0+0+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+0+0+0+2:vgprValuB_X1_I0+0+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+2], v[vgprValuA_X1_I0+28+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuA_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+0+0+0+0:vgprValuB_X1_I0+0+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+0+0+0+2:vgprValuB_X1_I0+0+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+0], v[vgprValuB_X1_I0+0+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+1], v[vgprValuB_X1_I0+0+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+0], v[vgprValuB_X1_I0+0+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+1], v[vgprValuB_X1_I0+0+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+2], v[vgprValuB_X1_I0+0+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+3], v[vgprValuB_X1_I0+0+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+4+0+0+0:vgprValuB_X1_I0+4+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+4+0+0+2:vgprValuB_X1_I0+4+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+2], v[vgprValuB_X1_I0+0+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+0+0+0+3], v[vgprValuB_X1_I0+0+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+4+0+0+0:vgprValuB_X1_I0+4+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+4+0+0+2:vgprValuB_X1_I0+4+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+0], v[vgprValuB_X1_I0+4+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+1], v[vgprValuB_X1_I0+4+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+0], v[vgprValuB_X1_I0+4+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+1], v[vgprValuB_X1_I0+4+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+2], v[vgprValuB_X1_I0+4+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+3], v[vgprValuB_X1_I0+4+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+8+0+0+0:vgprValuB_X1_I0+8+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+8+0+0+2:vgprValuB_X1_I0+8+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+2], v[vgprValuB_X1_I0+4+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+4+0+0+3], v[vgprValuB_X1_I0+4+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+8+0+0+0:vgprValuB_X1_I0+8+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+8+0+0+2:vgprValuB_X1_I0+8+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+0], v[vgprValuB_X1_I0+8+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+1], v[vgprValuB_X1_I0+8+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+0], v[vgprValuB_X1_I0+8+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+1], v[vgprValuB_X1_I0+8+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+2], v[vgprValuB_X1_I0+8+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+3], v[vgprValuB_X1_I0+8+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+12+0+0+0:vgprValuB_X1_I0+12+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+12+0+0+2:vgprValuB_X1_I0+12+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+2], v[vgprValuB_X1_I0+8+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+8+0+0+3], v[vgprValuB_X1_I0+8+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+12+0+0+0:vgprValuB_X1_I0+12+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+12+0+0+2:vgprValuB_X1_I0+12+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+0], v[vgprValuB_X1_I0+12+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+1], v[vgprValuB_X1_I0+12+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+0], v[vgprValuB_X1_I0+12+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+1], v[vgprValuB_X1_I0+12+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+2], v[vgprValuB_X1_I0+12+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+3], v[vgprValuB_X1_I0+12+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+16+0+0+0:vgprValuB_X1_I0+16+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+16+0+0+2:vgprValuB_X1_I0+16+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+2], v[vgprValuB_X1_I0+12+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+12+0+0+3], v[vgprValuB_X1_I0+12+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+16+0+0+0:vgprValuB_X1_I0+16+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+16+0+0+2:vgprValuB_X1_I0+16+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+0], v[vgprValuB_X1_I0+16+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+1], v[vgprValuB_X1_I0+16+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+0], v[vgprValuB_X1_I0+16+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+1], v[vgprValuB_X1_I0+16+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+2], v[vgprValuB_X1_I0+16+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+3], v[vgprValuB_X1_I0+16+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+20+0+0+0:vgprValuB_X1_I0+20+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+20+0+0+2:vgprValuB_X1_I0+20+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+2], v[vgprValuB_X1_I0+16+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+16+0+0+3], v[vgprValuB_X1_I0+16+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+20+0+0+0:vgprValuB_X1_I0+20+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+20+0+0+2:vgprValuB_X1_I0+20+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+0], v[vgprValuB_X1_I0+20+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+1], v[vgprValuB_X1_I0+20+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+0], v[vgprValuB_X1_I0+20+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+1], v[vgprValuB_X1_I0+20+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+2], v[vgprValuB_X1_I0+20+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+3], v[vgprValuB_X1_I0+20+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+24+0+0+0:vgprValuB_X1_I0+24+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+24+0+0+2:vgprValuB_X1_I0+24+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+2], v[vgprValuB_X1_I0+20+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+20+0+0+3], v[vgprValuB_X1_I0+20+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+24+0+0+0:vgprValuB_X1_I0+24+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+24+0+0+2:vgprValuB_X1_I0+24+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+0], v[vgprValuB_X1_I0+24+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+1], v[vgprValuB_X1_I0+24+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+0], v[vgprValuB_X1_I0+24+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+1], v[vgprValuB_X1_I0+24+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+2], v[vgprValuB_X1_I0+24+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+3], v[vgprValuB_X1_I0+24+0+0+3], v141, s[sgpr104:sgpr105] -v_lshlrev_b64 v[138:139], s[sgpr106], v[vgprValuB_X1_I0+28+0+0+0:vgprValuB_X1_I0+28+0+0+0+1] -v_lshlrev_b64 v[140:141], s[sgpr106], v[vgprValuB_X1_I0+28+0+0+2:vgprValuB_X1_I0+28+0+0+2+1] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+2], v[vgprValuB_X1_I0+24+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+24+0+0+3], v[vgprValuB_X1_I0+24+0+0+3], v141, s[98:99] +v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X1_I0+28+0+0+0:vgprValuB_X1_I0+28+0+0+0+1] +v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X1_I0+28+0+0+2:vgprValuB_X1_I0+28+0+0+2+1] v_add_u32 v136, v135, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+0], v[vgprValuB_X1_I0+28+0+0+0], v138, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+1], v[vgprValuB_X1_I0+28+0+0+1], v139, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+0], v[vgprValuB_X1_I0+28+0+0+0], v138, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+1], v[vgprValuB_X1_I0+28+0+0+1], v139, s[98:99] v_add_u32 v136, v136, 4 // add part of K -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+2], v[vgprValuB_X1_I0+28+0+0+2], v140, s[sgpr104:sgpr105] -v_cmp_ge_i32 s[sgpr104:sgpr105], v136, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+3], v[vgprValuB_X1_I0+28+0+0+3], v141, s[sgpr104:sgpr105] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+2], v[vgprValuB_X1_I0+28+0+0+2], v140, s[98:99] +v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL] // check K index >= Size L +v_cndmask_b32 v[vgprValuB_X1_I0+28+0+0+3], v[vgprValuB_X1_I0+28+0+0+3], v141, s[98:99] s_nop 1 v_mfma_f32_16x16x32_f16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0] v_mfma_f32_16x16x32_f16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0] @@ -4438,12 +4389,12 @@ s_add_u32 s[sgprOrigLoopCounter], s[sgprOrigLoopCounter], 0x20 // inc counterL s_cmp_le_i32 s[sgprLoopCounterL], 0x0 // counterL<=0 s_cbranch_scc0 label_TailLoopBeginL // restart LoopL label_TailLoopEndL: -s_mov_b32 s[sgpr104], 2 // tailloop lds offset -s_mul_i32 s[sgpr104], s[sgprOrigLoopCounter], s[sgpr104] // scale by mul -v_sub_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgpr104] // remove lro damage -s_mov_b32 s[sgpr104], 2 // tailloop lds offset -s_mul_i32 s[sgpr104], s[sgprOrigLoopCounter], s[sgpr104] // scale by mul -v_sub_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgpr104] // remove lro damage +s_mov_b32 s97, 2 // tailloop lds offset +s_mul_i32 s97, s[sgprOrigLoopCounter], s97 // scale by mul +v_sub_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s97 // remove lro damage +s_mov_b32 s97, 2 // tailloop lds offset +s_mul_i32 s97, s[sgprOrigLoopCounter], s97 // scale by mul +v_sub_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s97 // remove lro damage label_SkipTailLoopL: .set vgprValuA_X0_I0_BASE, UNDEF .set vgprValuA_X0_I0, UNDEF @@ -4454,11 +4405,11 @@ label_SkipTailLoopL: label_Summation_End_DZOUDPYJU2HHRCOQ: .set sgprLoopCounterL, UNDEF .set sgprOrigLoopCounter, UNDEF -.set sgprStaggerUIter, UNDEF .set sgprSrdA, UNDEF .set sgprSrdB, UNDEF .set sgprShadowLimitA, UNDEF .set sgprShadowLimitB, UNDEF +.set sgprStaggerUIter, UNDEF .set sgprWrapUA, UNDEF .set sgprWrapUB, UNDEF .set sgprGlobalReadIncsA, UNDEF @@ -4466,54 +4417,27 @@ label_Summation_End_DZOUDPYJU2HHRCOQ: .set sgprScalarGlobalReadOffsetA, UNDEF .set sgprScalarGlobalReadOffsetB, UNDEF /* load store sgprs */ -.set sgprAddressScaleAlphaVec, 72 -.set sgprAddressBias, 74 -.set sgprBiasType, 76 -.set sgprBiasStride, 77 -.set sgpractivationAlpha, 78 -.set sgpractivationBeta, 79 -.set sgprActivationType, 80 - -v_readlane_b32 s[sgprSKItersPerWG], v255, 0 -s_nop 0 -v_readlane_b32 s[sgprskGrid], v255, 1 -s_nop 0 -v_readlane_b32 s[sgprMagicNumberProblemNumGroupTiles0], v255, 2 -s_nop 0 -v_readlane_b32 s[sgprMagicShiftProblemNumGroupTiles0], v255, 3 -s_nop 0 -v_readlane_b32 s[sgprMagicShiftItersPerTile], v255, 4 -s_nop 0 -v_readlane_b32 s[sgprMagicNumProblemNumGroupTiles0By1], v255, 5 -s_nop 0 -v_readlane_b32 s[sgprWGM], v255, 6 -s_nop 0 -v_readlane_b32 s[sgprKernArgAddress], v255, 7 -s_nop 0 -v_readlane_b32 s[sgprKernArgAddress+1], v255, 8 - -.set sgpr104, UNDEF -.set sgpr105, UNDEF -.set sgpr106, UNDEF -.set sgpr107, UNDEF -.set sgpr108, UNDEF -.set sgpr109, UNDEF -.set sgpr110, UNDEF - +.set sgprAddressScaleAlphaVec, 64 +.set sgprAddressBias, 66 +.set sgprBiasType, 68 +.set sgprBiasStride, 69 +.set sgpractivationAlpha, 70 +.set sgpractivationBeta, 71 +.set sgprActivationType, 72 /* Check if custom structure pointer is null */ s_cmp_eq_u32 s[sgprArgType], 2 // ArgType == 2 ? s_cbranch_scc1 label_LoadExternalEpilogueStruct // branch if ArgType == 2 -s_load_dwordx8 s[72:79], s[sgprKernArgAddress:sgprKernArgAddress+1], 152 // 152 -s_load_dword s80, s[sgprKernArgAddress:sgprKernArgAddress+1], 184 // 184 +s_load_dwordx8 s[64:71], s[sgprKernArgAddress:sgprKernArgAddress+1], 124 // 124 +s_load_dword s72, s[sgprKernArgAddress:sgprKernArgAddress+1], 156 // 156 s_branch label_LoadExternalEpilogueStructEnd label_LoadExternalEpilogueStruct: -s_load_dwordx4 s[72:75], s[sgprKernArgAddress:sgprKernArgAddress+1], 208 // 208 -s_load_dwordx2 s[76:77], s[sgprKernArgAddress:sgprKernArgAddress+1], 224 // 224 -s_load_dwordx2 s[78:79], s[sgprKernArgAddress:sgprKernArgAddress+1], 248 // 248 -s_load_dword s80, s[sgprKernArgAddress:sgprKernArgAddress+1], 256 // 256 +s_load_dwordx4 s[64:67], s[sgprKernArgAddress:sgprKernArgAddress+1], 180 // 180 +s_load_dwordx2 s[68:69], s[sgprKernArgAddress:sgprKernArgAddress+1], 196 // 196 +s_load_dwordx2 s[70:71], s[sgprKernArgAddress:sgprKernArgAddress+1], 220 // 220 +s_load_dword s72, s[sgprKernArgAddress:sgprKernArgAddress+1], 228 // 228 label_LoadExternalEpilogueStructEnd: -.set sgprSrdScaleAlphaVec, 84 -.set sgprSrdBias, 88 +.set sgprSrdScaleAlphaVec, 76 +.set sgprSrdBias, 80 /* Mapping of Acc register -> C Vgpr register */ @@ -4628,34 +4552,44 @@ label_Load_Bias_End: .set sgprSrdScaleAlphaVec, UNDEF s_cmp_eq_u32 s[sgprStreamKLocalStart], 0 // does wg start tile? s_cbranch_scc1 label_NoBranch_QWMA7J3AUDGL0X23 // Only branch on scc0 -s_getpc_b64 s[92:93] // addr of next instr -s_add_i32 s94, label_SK_Partials, 4 // target branch offset -s_add_u32 s92, s92, s94 // add target branch offset -s_addc_u32 s93, s93, 0 // add high and carry -s_setpc_b64 s[92:93] // branch to label_SK_Partials +s_getpc_b64 s[84:85] // addr of next instr +s_add_i32 s86, label_SK_Partials, 4 // target branch offset +s_add_u32 s84, s84, s86 // add target branch offset +s_addc_u32 s85, s85, 0 // add high and carry +s_setpc_b64 s[84:85] // branch to label_SK_Partials label_NoBranch_QWMA7J3AUDGL0X23: s_cmp_eq_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // does wg finish tile? s_cbranch_scc1 label_SK_Store // Branch if started and finished tile, go to regular store code -s_add_u32 s67, s[sgprStreamKIdx], 1 // input partial tile index -s_mul_hi_u32 s82, s[sgprStreamKIterEnd], s[sgprMagicNumberItersPerTile] // s_magic mul, div alg 2 -s_lshr_b32 s83, s[sgprMagicShiftItersPerTile], 31 // tmpS = extract abit -s_mul_i32 s81, s[sgprStreamKIterEnd], s83 // s_magic mul, div alg 2 -s_add_u32 s81, s81, s82 -s_and_b32 s83, s[sgprMagicShiftItersPerTile], 2147483647 // tmpS = remove abit to final shift -s_lshr_b32 s81, s81, s83 // sMagicDiv Alg 2 -s_mul_i32 s81, s81, s[sgprItersPerTile] // start iteration of partial tile -s_sub_u32 s85, s[sgprStreamKIterEnd], s81 // calc iterations completed by this WG +s_add_u32 s77, s[sgprStreamKIdx], 1 // input partial tile index +v_cvt_f32_u32 v17, s[sgprItersPerTile] // StreamKIterEnd // ItersPerTile +v_rcp_iflag_f32 v17, v17 // StreamKIterEnd // ItersPerTile +v_cvt_f32_u32 v18, s[sgprStreamKIterEnd] // StreamKIterEnd // ItersPerTile +v_mul_f32 v17, v17, v18 // StreamKIterEnd // ItersPerTile +v_cvt_u32_f32 v17, v17 // StreamKIterEnd // ItersPerTile +v_mul_u32_u24 v18, v17, s[sgprItersPerTile] // StreamKIterEnd // ItersPerTile +v_sub_u32 v18, s[sgprStreamKIterEnd], v18 // StreamKIterEnd // ItersPerTile +v_cmpx_eq_u32 exec, v18, s[sgprItersPerTile] // StreamKIterEnd // ItersPerTile +v_add_u32 v17, 1, v17 // StreamKIterEnd // ItersPerTile +v_mov_b32 v18, 0 // StreamKIterEnd // ItersPerTile +s_mov_b64 exec, -1 // Reset exec +v_cmpx_gt_u32 exec, v18, s[sgprItersPerTile] // overflow happened in remainder +v_sub_u32 v17, v17, 1 // quotient - 1 +v_mul_u32_u24 v18, v17, s[sgprItersPerTile] // re-calculate remainder +v_sub_u32 v18, s[sgprStreamKIterEnd], v18 // re-calculate remainder +s_mov_b64 exec, -1 // Reset exec +v_readfirstlane_b32 s73, v17 // quotient +v_readfirstlane_b32 s78, v18 // remainder label_SK_Fixup: -s_lshl_b32 s81, s67, 2 // flag offset based on CTA index -s_load_dword s83, s[sgprAddressFlags:sgprAddressFlags+1], s81 glc // get flag +s_lshl_b32 s73, s77, 2 // flag offset based on CTA index +s_load_dword s75, s[sgprAddressFlags:sgprAddressFlags+1], s73 glc // get flag s_waitcnt lgkmcnt(0) // wait for flag load -s_cmp_eq_u32 s83, 1 // check if ready +s_cmp_eq_u32 s75, 1 // check if ready s_cbranch_scc0 label_SK_Fixup // if flag not set, wait and check again s_barrier // wait for all workgroups before resetting flag -v_readfirstlane_b32 s83, v[vgprSerial] // Wave 0 updates flags -s_cmp_eq_u32 s83, 0 // Check for wave 0 +v_readfirstlane_b32 s75, v[vgprSerial] // Wave 0 updates flags +s_cmp_eq_u32 s75, 0 // Check for wave 0 s_cbranch_scc0 label_SK_SkipFlagReset // Skip flag reset -s_store_dword s83, s[sgprAddressFlags:sgprAddressFlags+1], s81 glc // reset flag +s_store_dword s75, s[sgprAddressFlags:sgprAddressFlags+1], s73 glc // reset flag label_SK_SkipFlagReset: label_Fixup_E0: @@ -4664,8 +4598,8 @@ s_mov_b64 s[sgprSrdWS+0:sgprSrdWS+0+1], s[sgprAddressWS+0:sgprAddressWS+0+1] // s_mov_b32 s[sgprSrdWS+2], BufferOOB s_mov_b32 s[sgprSrdWS+3], Srd127_96 // Set bits 127_96 in post-loop SRD -s_mul_i32 s82, 0x40000, s67 // Offset to correct partials tile -s_add_u32 s[sgprSrdWS+0], s[sgprSrdWS+0], s82 // add lo to SRD +s_mul_i32 s74, 0x40000, s77 // Offset to correct partials tile +s_add_u32 s[sgprSrdWS+0], s[sgprSrdWS+0], s74 // add lo to SRD s_addc_u32 s[sgprSrdWS+1], s[sgprSrdWS+1], 0 // add hi to SRD /* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ @@ -4676,42 +4610,42 @@ s_addc_u32 s[sgprSrdWS+1], s[sgprSrdWS+1], 0 // add hi to SRD /* calc coords, apply mask, and issue loads (if necessary) */ v_lshlrev_b32 v18, 5, v[vgprSerial] // v18 = v[vgprSerial] * 32 -s_mov_b32 s82, 0 // Init sgpr offset -buffer_load_dwordx4 v[120:123], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[124:127], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS +s_mov_b32 s74, 0 // Init sgpr offset +buffer_load_dwordx4 v[120:123], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[124:127], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] @@ -5037,42 +4971,42 @@ s_nop 0 // 1 wait state required when /******************************************/ /* calc coords, apply mask, and issue loads (if necessary) */ -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[120:123], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[124:127], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[120:123], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[124:127], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS v_accvgpr_read_b32 v[vgprValuC+24], acc129 // copy acc to vreg[96] v_accvgpr_read_b32 v[vgprValuC+25], acc133 // copy acc to vreg[97] v_accvgpr_read_b32 v[vgprValuC+26], acc137 // copy acc to vreg[98] @@ -5398,30 +5332,30 @@ s_nop 0 // 1 wait state required when /******************************************/ /* calc coords, apply mask, and issue loads (if necessary) */ -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[88:91], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[92:95], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[96:99], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[100:103], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[104:107], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[108:111], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[112:115], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[116:119], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[120:123], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[124:127], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS -s_add_u32 s82, s82, 8192 // Inc sgpr offset -buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:0 // load WS -buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s82 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[88:91], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[92:95], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[96:99], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[100:103], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[104:107], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[108:111], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[112:115], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[116:119], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[120:123], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[124:127], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS +s_add_u32 s74, s74, 8192 // Inc sgpr offset +buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:0 // load WS +buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s74 offen offset:16 // load WS v_accvgpr_read_b32 v[vgprValuC+24], acc3 // copy acc to vreg[192] v_accvgpr_read_b32 v[vgprValuC+25], acc7 // copy acc to vreg[193] v_accvgpr_read_b32 v[vgprValuC+26], acc11 // copy acc to vreg[194] @@ -5635,42 +5569,42 @@ v_accvgpr_write_b32 acc251, v[vgprValuC+86] // copy vreg[254] to acc v_accvgpr_write_b32 acc255, v[vgprValuC+87] // copy vreg[255] to acc s_nop 1 // 2 wait states required before reading vgpr s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -s_add_u32 s81, s[sgprSKItersPerWG], 1 // Add extra iter -s_cmp_lt_u32 s67, s[sgprskExtraIters] // Check if next WG had an extra iteration -s_cselect_b32 s81, s81, s[sgprSKItersPerWG] // Select correct number of iterations for next WG -s_add_u32 s85, s85, s81 // next partial tile iteration -s_add_u32 s67, s67, 1 // next partial tile index -s_cmp_lt_u32 s85, s[sgprItersPerTile] // done loading partial tiles? +s_add_u32 s73, s[sgprSKItersPerWG], 1 // Add extra iter +s_cmp_lt_u32 s77, s[sgprskExtraIters] // Check if next WG had an extra iteration +s_cselect_b32 s73, s73, s[sgprSKItersPerWG] // Select correct number of iterations for next WG +s_add_u32 s78, s78, s73 // next partial tile iteration +s_add_u32 s77, s77, 1 // next partial tile index +s_cmp_lt_u32 s78, s[sgprItersPerTile] // done loading partial tiles? s_cbranch_scc1 label_SK_Fixup // Branch to continue fixup loop label_SK_Store: s_cmpk_eq_u32 s[sgprBeta], 0 // Beta == 0 s_cbranch_scc0 label_GW_Beta // Branch if Beta is not zero -s_and_b32 s82, 255, s[sgprSizeI] // s82 = s[sgprSizeI] % 256 -s_add_u32 s83, -0x1, s[sgprNumWorkGroups0] -s_cmp_ge_u32 s[sgprWorkGroup0], s83 // wg0 >= nwg0-1 ? -s_cselect_b32 s82, s82, 0 // set rMT0 -s_cmpk_gt_u32 s82, 0 // rMT0 > 0 +s_and_b32 s74, 255, s[sgprSizeI] // s74 = s[sgprSizeI] % 256 +s_add_u32 s75, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s75 // wg0 >= nwg0-1 ? +s_cselect_b32 s74, s74, 0 // set rMT0 +s_cmpk_gt_u32 s74, 0 // rMT0 > 0 s_cbranch_scc0 label_NoBranch_0MXDW6EW9K7ZNG8F // Only branch on scc1 // jump if edges required -s_getpc_b64 s[82:83] // addr of next instr -s_add_i32 s84, label_GW_B0_E1_M, 4 // target branch offset -s_add_u32 s82, s82, s84 // add target branch offset -s_addc_u32 s83, s83, 0 // add high and carry -s_setpc_b64 s[82:83] // branch to label_GW_B0_E1_M +s_getpc_b64 s[74:75] // addr of next instr +s_add_i32 s76, label_GW_B0_E1_M, 4 // target branch offset +s_add_u32 s74, s74, s76 // add target branch offset +s_addc_u32 s75, s75, 0 // add high and carry +s_setpc_b64 s[74:75] // branch to label_GW_B0_E1_M label_NoBranch_0MXDW6EW9K7ZNG8F: -s_and_b32 s82, 255, s[sgprSizeJ] // s82 = s[sgprSizeJ] % 256 -s_add_u32 s83, -0x1, s[sgprNumWorkGroups1] -s_cmp_ge_u32 s[sgprWorkGroup1], s83 // wg1 >= nwg1-1 -s_cselect_b32 s82, s82, 0 // set rMT1 -s_cmpk_gt_u32 s82, 0 // rMT1 > 0 +s_and_b32 s74, 255, s[sgprSizeJ] // s74 = s[sgprSizeJ] % 256 +s_add_u32 s75, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s75 // wg1 >= nwg1-1 +s_cselect_b32 s74, s74, 0 // set rMT1 +s_cmpk_gt_u32 s74, 0 // rMT1 > 0 s_cbranch_scc0 label_NoBranch_IXPKU979JKZCQDH3 // Only branch on scc1 // jump if edges required -s_getpc_b64 s[82:83] // addr of next instr -s_add_i32 s84, label_GW_B0_E1_N, 4 // target branch offset -s_add_u32 s82, s82, s84 // add target branch offset -s_addc_u32 s83, s83, 0 // add high and carry -s_setpc_b64 s[82:83] // branch to label_GW_B0_E1_N +s_getpc_b64 s[74:75] // addr of next instr +s_add_i32 s76, label_GW_B0_E1_N, 4 // target branch offset +s_add_u32 s74, s74, s76 // add target branch offset +s_addc_u32 s75, s75, 0 // add high and carry +s_setpc_b64 s[74:75] // branch to label_GW_B0_E1_N label_NoBranch_IXPKU979JKZCQDH3: label_GW_B0_E0: s_cmpk_eq_u32 s[sgprActivationType], 3 // activationType == 3 @@ -5679,28 +5613,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_0_edge_0 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_0_edge_0 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_0_edge_0 // Branch if true label_To_Activation_None_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_To_Activation_Gelu_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_To_Activation_Relu_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_To_Activation_Silu_VW8_beta_0_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_5 +label_To_Activation_Clamp_VW8_beta_0_edge_0: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s73, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_5 label_ActivationSetPCAddrEnd_5: @@ -5715,8 +5657,8 @@ label_ActivationSetPCAddrEnd_5: /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v19, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v19, v4, s74 v_lshlrev_b32 v19, 0x2, v19 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -5842,7 +5784,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -5868,7 +5810,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -5885,8 +5827,8 @@ v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5897,7 +5839,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -5914,8 +5856,8 @@ v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5926,7 +5868,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -5943,8 +5885,8 @@ v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5955,7 +5897,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -5972,8 +5914,8 @@ v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -5984,7 +5926,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -6001,8 +5943,8 @@ v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6013,7 +5955,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -6030,8 +5972,8 @@ v_pack_b32_f16 v74, v[vgprValuC+76], v[vgprValuC+77] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+78], v[vgprValuC+78] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+79], v[vgprValuC+79] // convert C to fp16 v_pack_b32_f16 v75, v[vgprValuC+78], v[vgprValuC+79] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6042,7 +5984,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -6059,8 +6001,8 @@ v_pack_b32_f16 v82, v[vgprValuC+84], v[vgprValuC+85] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+86], v[vgprValuC+86] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+87], v[vgprValuC+87] // convert C to fp16 v_pack_b32_f16 v83, v[vgprValuC+86], v[vgprValuC+87] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6194,7 +6136,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -6211,8 +6153,8 @@ v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6223,7 +6165,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -6240,8 +6182,8 @@ v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6252,7 +6194,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -6269,8 +6211,8 @@ v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6281,7 +6223,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -6298,8 +6240,8 @@ v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6310,7 +6252,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -6327,8 +6269,8 @@ v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6339,7 +6281,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -6356,8 +6298,8 @@ v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6368,7 +6310,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -6385,8 +6327,8 @@ v_pack_b32_f16 v74, v[vgprValuC+76], v[vgprValuC+77] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+78], v[vgprValuC+78] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+79], v[vgprValuC+79] // convert C to fp16 v_pack_b32_f16 v75, v[vgprValuC+78], v[vgprValuC+79] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6397,7 +6339,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -6414,8 +6356,8 @@ v_pack_b32_f16 v82, v[vgprValuC+84], v[vgprValuC+85] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+86], v[vgprValuC+86] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+87], v[vgprValuC+87] // convert C to fp16 v_pack_b32_f16 v83, v[vgprValuC+86], v[vgprValuC+87] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6549,7 +6491,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -6566,8 +6508,8 @@ v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6578,7 +6520,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -6595,8 +6537,8 @@ v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6607,7 +6549,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -6624,8 +6566,8 @@ v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6636,7 +6578,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -6653,8 +6595,8 @@ v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6665,7 +6607,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -6682,8 +6624,8 @@ v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6694,7 +6636,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -6711,8 +6653,8 @@ v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6723,7 +6665,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -6740,8 +6682,8 @@ v_pack_b32_f16 v74, v[vgprValuC+76], v[vgprValuC+77] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+78], v[vgprValuC+78] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+79], v[vgprValuC+79] // convert C to fp16 v_pack_b32_f16 v75, v[vgprValuC+78], v[vgprValuC+79] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6752,7 +6694,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -6769,8 +6711,8 @@ v_pack_b32_f16 v82, v[vgprValuC+84], v[vgprValuC+85] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+86], v[vgprValuC+86] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+87], v[vgprValuC+87] // convert C to fp16 v_pack_b32_f16 v83, v[vgprValuC+86], v[vgprValuC+87] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6904,7 +6846,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -6921,8 +6863,8 @@ v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6933,7 +6875,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -6950,8 +6892,8 @@ v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6962,7 +6904,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -6979,8 +6921,8 @@ v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -6991,7 +6933,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -7008,8 +6950,8 @@ v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -7020,7 +6962,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -7037,8 +6979,8 @@ v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -7049,7 +6991,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -7066,8 +7008,8 @@ v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -7078,7 +7020,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -7095,8 +7037,8 @@ v_pack_b32_f16 v74, v[vgprValuC+76], v[vgprValuC+77] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+78], v[vgprValuC+78] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+79], v[vgprValuC+79] // convert C to fp16 v_pack_b32_f16 v75, v[vgprValuC+78], v[vgprValuC+79] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0) @@ -7107,7 +7049,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -7124,8 +7066,8 @@ v_pack_b32_f16 v82, v[vgprValuC+84], v[vgprValuC+85] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+86], v[vgprValuC+86] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+87], v[vgprValuC+87] // convert C to fp16 v_pack_b32_f16 v83, v[vgprValuC+86], v[vgprValuC+87] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7137,28 +7079,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_0_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_0_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_0_edge_1 // Branch if true label_To_Activation_None_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_To_Activation_Gelu_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_To_Activation_Relu_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_To_Activation_Silu_VW8_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_4 +label_To_Activation_Clamp_VW8_beta_0_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s73, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_4 label_ActivationSetPCAddrEnd_4: @@ -7174,11 +7124,11 @@ label_ActivationSetPCAddrEnd_4: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v4, s74 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -7187,105 +7137,105 @@ ds_read_b128 v[92:95], v18 offset:16 // load Bias ds_read_b128 v[96:99], v18 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v20, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v20, v4, s74 v_lshlrev_b32 v20, 0x2, v20 // Bias address scaled by BPE v_add_lshl_u32 v19, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v4, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] @@ -7395,7 +7345,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -7421,7 +7371,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -7447,7 +7397,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -7473,7 +7423,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -7499,7 +7449,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -7525,7 +7475,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -7551,7 +7501,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -7577,7 +7527,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -7611,116 +7561,116 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v4, s74 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[88:91], v18 offset:0 // load Bias ds_read_b128 v[92:95], v18 offset:16 // load Bias ds_read_b128 v[96:99], v18 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v20, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v20, v4, s74 v_lshlrev_b32 v20, 0x2, v20 // Bias address scaled by BPE v_add_lshl_u32 v19, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v4, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc1 // copy acc to vreg[64] v_accvgpr_read_b32 v[vgprValuC+25], acc5 // copy acc to vreg[65] v_accvgpr_read_b32 v[vgprValuC+26], acc9 // copy acc to vreg[66] @@ -7830,7 +7780,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -7856,7 +7806,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -7882,7 +7832,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -7908,7 +7858,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -7934,7 +7884,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -7960,7 +7910,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -7986,7 +7936,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -8012,7 +7962,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -8046,116 +7996,116 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v4, s74 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[88:91], v18 offset:0 // load Bias ds_read_b128 v[92:95], v18 offset:16 // load Bias ds_read_b128 v[96:99], v18 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v20, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v20, v4, s74 v_lshlrev_b32 v20, 0x2, v20 // Bias address scaled by BPE v_add_lshl_u32 v19, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v4, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc2 // copy acc to vreg[128] v_accvgpr_read_b32 v[vgprValuC+25], acc6 // copy acc to vreg[129] v_accvgpr_read_b32 v[vgprValuC+26], acc10 // copy acc to vreg[130] @@ -8265,7 +8215,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -8291,7 +8241,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -8317,7 +8267,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -8343,7 +8293,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -8369,7 +8319,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -8395,7 +8345,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -8421,7 +8371,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -8447,7 +8397,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -8481,116 +8431,116 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v4, s74 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[88:91], v18 offset:0 // load Bias ds_read_b128 v[92:95], v18 offset:16 // load Bias ds_read_b128 v[96:99], v18 offset:1024 // load scaleAlpha ds_read_b128 v[100:103], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v20, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v20, v4, s74 v_lshlrev_b32 v20, 0x2, v20 // Bias address scaled by BPE v_add_lshl_u32 v19, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v22, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v22, v4, s74 v_lshlrev_b32 v22, 0x2, v22 // Bias address scaled by BPE v_add_lshl_u32 v21, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v21, v16, v21, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v21, v16, v21, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v4, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v23, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v23, v16, v23, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v23, v16, v23, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v4, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v4, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc3 // copy acc to vreg[192] v_accvgpr_read_b32 v[vgprValuC+25], acc7 // copy acc to vreg[193] v_accvgpr_read_b32 v[vgprValuC+26], acc11 // copy acc to vreg[194] @@ -8700,7 +8650,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -8726,7 +8676,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -8752,7 +8702,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -8778,7 +8728,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -8804,7 +8754,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -8830,7 +8780,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -8856,7 +8806,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[72:73], v[8:9] v_mov_b64 v[74:75], v[10:11] v_mov_b64 v[76:77], v[12:13] @@ -8882,7 +8832,7 @@ v_pk_add_f32 v[8:9], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias v_pk_add_f32 v[10:11], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias v_pk_add_f32 v[12:13], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias v_pk_add_f32 v[14:15], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[80:81], v[8:9] v_mov_b64 v[82:83], v[10:11] v_mov_b64 v[84:85], v[12:13] @@ -8909,28 +8859,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW1_beta_0_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW1_beta_0_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW1_beta_0_edge_1 // Branch if true label_To_Activation_None_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_None_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_None_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_To_Activation_Gelu_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Gelu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Gelu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_To_Activation_Relu_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Relu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Relu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_To_Activation_Silu_VW1_beta_0_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Silu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Silu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_3 +label_To_Activation_Clamp_VW1_beta_0_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s73, label_Activation_Clamp_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_3 label_ActivationSetPCAddrEnd_3: @@ -8946,482 +8904,482 @@ label_ActivationSetPCAddrEnd_3: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v68, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v68, v8, s74 v_lshlrev_b32 v68, 0x2, v68 // Bias address scaled by BPE ds_read_b32 v65, v68 offset:0 // load Bias ds_read_b32 v66, v68 offset:1024 // load scaleAlpha v_add_lshl_u32 v67, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v67, v16, v67, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v67, v16, v67, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v72, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v72, v8, s74 v_lshlrev_b32 v72, 0x2, v72 // Bias address scaled by BPE ds_read_b32 v69, v72 offset:0 // load Bias ds_read_b32 v70, v72 offset:1024 // load scaleAlpha v_add_lshl_u32 v71, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v71, v16, v71, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v71, v16, v71, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v80, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v80, v8, s74 v_lshlrev_b32 v80, 0x2, v80 // Bias address scaled by BPE ds_read_b32 v77, v80 offset:0 // load Bias ds_read_b32 v78, v80 offset:1024 // load scaleAlpha v_add_lshl_u32 v79, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v79, v16, v79, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v79, v16, v79, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v88, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v88, v8, s74 v_lshlrev_b32 v88, 0x2, v88 // Bias address scaled by BPE ds_read_b32 v85, v88 offset:0 // load Bias ds_read_b32 v86, v88 offset:1024 // load scaleAlpha v_add_lshl_u32 v87, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v87, v16, v87, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v87, v16, v87, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v8, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE ds_read_b32 v89, v92 offset:0 // load Bias ds_read_b32 v90, v92 offset:1024 // load scaleAlpha v_add_lshl_u32 v91, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v91, v16, v91, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v91, v16, v91, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v8, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v8, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v103, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v103, v16, v103, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v103, v16, v103, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v8, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v122, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v122, v8, s74 v_lshlrev_b32 v122, 0x2, v122 // Bias address scaled by BPE v_add_lshl_u32 v121, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v121, v16, v121, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v121, v16, v121, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v4, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v128, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v128, v8, s74 v_lshlrev_b32 v128, 0x2, v128 // Bias address scaled by BPE v_add_lshl_u32 v127, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v127, v16, v127, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v127, v16, v127, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v137, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v137, v8, s74 v_lshlrev_b32 v137, 0x2, v137 // Bias address scaled by BPE v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v136, v16, v136, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v136, v16, v136, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v143, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v143, v8, s74 v_lshlrev_b32 v143, 0x2, v143 // Bias address scaled by BPE v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v142, v16, v142, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v142, v16, v142, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v149, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v149, v8, s74 v_lshlrev_b32 v149, 0x2, v149 // Bias address scaled by BPE v_add_lshl_u32 v148, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v148, v16, v148, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v148, v16, v148, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v155, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v155, v8, s74 v_lshlrev_b32 v155, 0x2, v155 // Bias address scaled by BPE v_add_lshl_u32 v154, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v154, v16, v154, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v154, v16, v154, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v161, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v161, v4, s74 v_lshlrev_b32 v161, 0x2, v161 // Bias address scaled by BPE v_add_lshl_u32 v160, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v160, v16, v160, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v160, v16, v160, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v167, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v167, v8, s74 v_lshlrev_b32 v167, 0x2, v167 // Bias address scaled by BPE v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v166, v16, v166, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v166, v16, v166, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+18], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+19], acc8 // copy acc to vreg[2] @@ -9496,265 +9454,265 @@ s_waitcnt lgkmcnt(0) // wait for Bias LDS, ScaleAl /* apply mask, calc new C and issue writes */ v_mul_f32 v[vgprValuC+17], v62, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v17, v8 v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 buffer_store_short v17, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v66, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v18, v8 v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 buffer_store_short v18, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v70, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v19, v8 v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 buffer_store_short v19, v71, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v74, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v20, v8 v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 buffer_store_short v20, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v78, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 buffer_store_short v21, v79, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v86, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 buffer_store_short v23, v87, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v90, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 buffer_store_short v24, v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v62, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 buffer_store_short v25, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v66, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 buffer_store_short v26, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v70, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 buffer_store_short v27, v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v74, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 buffer_store_short v28, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v78, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 buffer_store_short v29, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 buffer_store_short v30, v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v86, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 buffer_store_short v31, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v90, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v62, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 buffer_store_short v33, v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v66, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 buffer_store_short v34, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v70, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 buffer_store_short v35, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v74, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 buffer_store_short v36, v115, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v78, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 buffer_store_short v37, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 buffer_store_short v38, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v86, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 buffer_store_short v39, v121, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v90, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 buffer_store_short v40, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v62, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 buffer_store_short v41, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v66, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 buffer_store_short v42, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v70, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 buffer_store_short v43, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v74, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 buffer_store_short v44, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v78, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 buffer_store_short v45, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 buffer_store_short v46, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v86, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 buffer_store_short v47, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v90, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 buffer_store_short v48, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v62, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 buffer_store_short v49, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v66, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 buffer_store_short v50, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v70, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 buffer_store_short v51, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v74, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 buffer_store_short v52, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v78, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 buffer_store_short v53, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 buffer_store_short v54, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v86, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_f16_f32 v55, v[vgprValuC+55] // convert C to fp16 buffer_store_short v55, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v90, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_f16_f32 v56, v[vgprValuC+56] // convert C to fp16 buffer_store_short v56, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v62, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v57, v8 v_cvt_f16_f32 v57, v[vgprValuC+57] // convert C to fp16 buffer_store_short v57, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v66, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v58, v8 v_cvt_f16_f32 v58, v[vgprValuC+58] // convert C to fp16 buffer_store_short v58, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v70, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v59, v8 v_cvt_f16_f32 v59, v[vgprValuC+59] // convert C to fp16 buffer_store_short v59, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v74, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v60, v8 v_cvt_f16_f32 v60, v[vgprValuC+60] // convert C to fp16 buffer_store_short v60, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -9770,480 +9728,480 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,5,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v8, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v68, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v68, v8, s74 v_lshlrev_b32 v68, 0x2, v68 // Bias address scaled by BPE ds_read_b32 v65, v68 offset:0 // load Bias ds_read_b32 v66, v68 offset:1024 // load scaleAlpha v_add_lshl_u32 v67, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v67, v16, v67, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v67, v16, v67, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v72, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v72, v8, s74 v_lshlrev_b32 v72, 0x2, v72 // Bias address scaled by BPE ds_read_b32 v69, v72 offset:0 // load Bias ds_read_b32 v70, v72 offset:1024 // load scaleAlpha v_add_lshl_u32 v71, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v71, v16, v71, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v71, v16, v71, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v80, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v80, v4, s74 v_lshlrev_b32 v80, 0x2, v80 // Bias address scaled by BPE ds_read_b32 v77, v80 offset:0 // load Bias ds_read_b32 v78, v80 offset:1024 // load scaleAlpha v_add_lshl_u32 v79, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v79, v16, v79, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v79, v16, v79, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v88, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v88, v8, s74 v_lshlrev_b32 v88, 0x2, v88 // Bias address scaled by BPE ds_read_b32 v85, v88 offset:0 // load Bias ds_read_b32 v86, v88 offset:1024 // load scaleAlpha v_add_lshl_u32 v87, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v87, v16, v87, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v87, v16, v87, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v8, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE ds_read_b32 v89, v92 offset:0 // load Bias ds_read_b32 v90, v92 offset:1024 // load scaleAlpha v_add_lshl_u32 v91, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v91, v16, v91, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v91, v16, v91, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v8, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v8, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v4, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v8, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v103, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v103, v16, v103, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v103, v16, v103, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v8, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v8, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v122, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v122, v8, s74 v_lshlrev_b32 v122, 0x2, v122 // Bias address scaled by BPE v_add_lshl_u32 v121, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v121, v16, v121, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v121, v16, v121, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v128, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v128, v8, s74 v_lshlrev_b32 v128, 0x2, v128 // Bias address scaled by BPE v_add_lshl_u32 v127, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v127, v16, v127, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v127, v16, v127, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v137, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v137, v4, s74 v_lshlrev_b32 v137, 0x2, v137 // Bias address scaled by BPE v_add_lshl_u32 v136, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v136, v16, v136, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v136, v16, v136, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v143, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v143, v8, s74 v_lshlrev_b32 v143, 0x2, v143 // Bias address scaled by BPE v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v142, v16, v142, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v142, v16, v142, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v149, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v149, v8, s74 v_lshlrev_b32 v149, 0x2, v149 // Bias address scaled by BPE v_add_lshl_u32 v148, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v148, v16, v148, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v148, v16, v148, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v4, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v155, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v155, v8, s74 v_lshlrev_b32 v155, 0x2, v155 // Bias address scaled by BPE v_add_lshl_u32 v154, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v154, v16, v154, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v154, v16, v154, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v161, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v161, v8, s74 v_lshlrev_b32 v161, 0x2, v161 // Bias address scaled by BPE v_add_lshl_u32 v160, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v160, v16, v160, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v160, v16, v160, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v167, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v167, v8, s74 v_lshlrev_b32 v167, 0x2, v167 // Bias address scaled by BPE v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v166, v16, v166, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v166, v16, v166, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc176 // copy acc to vreg[44] v_accvgpr_read_b32 v[vgprValuC+18], acc180 // copy acc to vreg[45] v_accvgpr_read_b32 v[vgprValuC+19], acc184 // copy acc to vreg[46] @@ -10318,265 +10276,265 @@ s_waitcnt lgkmcnt(0) // wait for Bias LDS, ScaleAl /* apply mask, calc new C and issue writes */ v_mul_f32 v[vgprValuC+17], v62, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v17, v8 v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 buffer_store_short v17, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v66, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v18, v8 v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 buffer_store_short v18, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v70, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v19, v8 v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 buffer_store_short v19, v71, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v74, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v20, v8 v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 buffer_store_short v20, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v78, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 buffer_store_short v21, v79, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v86, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 buffer_store_short v23, v87, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v90, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 buffer_store_short v24, v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v62, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 buffer_store_short v25, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v66, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 buffer_store_short v26, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v70, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 buffer_store_short v27, v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v74, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 buffer_store_short v28, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v78, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 buffer_store_short v29, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 buffer_store_short v30, v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v86, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 buffer_store_short v31, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v90, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v62, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 buffer_store_short v33, v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v66, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 buffer_store_short v34, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v70, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 buffer_store_short v35, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v74, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 buffer_store_short v36, v115, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v78, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 buffer_store_short v37, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 buffer_store_short v38, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v86, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 buffer_store_short v39, v121, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v90, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 buffer_store_short v40, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v62, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 buffer_store_short v41, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v66, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 buffer_store_short v42, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v70, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 buffer_store_short v43, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v74, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 buffer_store_short v44, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v78, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 buffer_store_short v45, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 buffer_store_short v46, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v86, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 buffer_store_short v47, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v90, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 buffer_store_short v48, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v62, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 buffer_store_short v49, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v66, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 buffer_store_short v50, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v70, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 buffer_store_short v51, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v74, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 buffer_store_short v52, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v78, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 buffer_store_short v53, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 buffer_store_short v54, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v86, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_f16_f32 v55, v[vgprValuC+55] // convert C to fp16 buffer_store_short v55, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v90, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_f16_f32 v56, v[vgprValuC+56] // convert C to fp16 buffer_store_short v56, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v62, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v57, v8 v_cvt_f16_f32 v57, v[vgprValuC+57] // convert C to fp16 buffer_store_short v57, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v66, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v58, v8 v_cvt_f16_f32 v58, v[vgprValuC+58] // convert C to fp16 buffer_store_short v58, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v70, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v59, v8 v_cvt_f16_f32 v59, v[vgprValuC+59] // convert C to fp16 buffer_store_short v59, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v74, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v60, v8 v_cvt_f16_f32 v60, v[vgprValuC+60] // convert C to fp16 buffer_store_short v60, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -10596,480 +10554,480 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v68, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v68, v8, s74 v_lshlrev_b32 v68, 0x2, v68 // Bias address scaled by BPE ds_read_b32 v65, v68 offset:0 // load Bias ds_read_b32 v66, v68 offset:1024 // load scaleAlpha v_add_lshl_u32 v67, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v67, v16, v67, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v67, v16, v67, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v72, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v72, v8, s74 v_lshlrev_b32 v72, 0x2, v72 // Bias address scaled by BPE ds_read_b32 v69, v72 offset:0 // load Bias ds_read_b32 v70, v72 offset:1024 // load scaleAlpha v_add_lshl_u32 v71, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v71, v16, v71, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v71, v16, v71, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v80, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v80, v8, s74 v_lshlrev_b32 v80, 0x2, v80 // Bias address scaled by BPE ds_read_b32 v77, v80 offset:0 // load Bias ds_read_b32 v78, v80 offset:1024 // load scaleAlpha v_add_lshl_u32 v79, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v79, v16, v79, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v79, v16, v79, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v88, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v88, v8, s74 v_lshlrev_b32 v88, 0x2, v88 // Bias address scaled by BPE ds_read_b32 v85, v88 offset:0 // load Bias ds_read_b32 v86, v88 offset:1024 // load scaleAlpha v_add_lshl_u32 v87, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v87, v16, v87, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v87, v16, v87, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v8, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE ds_read_b32 v89, v92 offset:0 // load Bias ds_read_b32 v90, v92 offset:1024 // load scaleAlpha v_add_lshl_u32 v91, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v91, v16, v91, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v91, v16, v91, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v8, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v8, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v103, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v103, v16, v103, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v103, v16, v103, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v8, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v122, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v122, v8, s74 v_lshlrev_b32 v122, 0x2, v122 // Bias address scaled by BPE v_add_lshl_u32 v121, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v121, v16, v121, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v121, v16, v121, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v4, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v128, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v128, v8, s74 v_lshlrev_b32 v128, 0x2, v128 // Bias address scaled by BPE v_add_lshl_u32 v127, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v127, v16, v127, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v127, v16, v127, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v137, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v137, v8, s74 v_lshlrev_b32 v137, 0x2, v137 // Bias address scaled by BPE v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v136, v16, v136, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v136, v16, v136, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v143, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v143, v8, s74 v_lshlrev_b32 v143, 0x2, v143 // Bias address scaled by BPE v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v142, v16, v142, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v142, v16, v142, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v149, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v149, v8, s74 v_lshlrev_b32 v149, 0x2, v149 // Bias address scaled by BPE v_add_lshl_u32 v148, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v148, v16, v148, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v148, v16, v148, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v155, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v155, v8, s74 v_lshlrev_b32 v155, 0x2, v155 // Bias address scaled by BPE v_add_lshl_u32 v154, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v154, v16, v154, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v154, v16, v154, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v161, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v161, v4, s74 v_lshlrev_b32 v161, 0x2, v161 // Bias address scaled by BPE v_add_lshl_u32 v160, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v160, v16, v160, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v160, v16, v160, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v167, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v167, v8, s74 v_lshlrev_b32 v167, 0x2, v167 // Bias address scaled by BPE v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v166, v16, v166, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v166, v16, v166, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc97 // copy acc to vreg[88] v_accvgpr_read_b32 v[vgprValuC+18], acc101 // copy acc to vreg[89] v_accvgpr_read_b32 v[vgprValuC+19], acc105 // copy acc to vreg[90] @@ -11144,265 +11102,265 @@ s_waitcnt lgkmcnt(0) // wait for Bias LDS, ScaleAl /* apply mask, calc new C and issue writes */ v_mul_f32 v[vgprValuC+17], v62, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v17, v8 v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 buffer_store_short v17, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v66, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v18, v8 v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 buffer_store_short v18, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v70, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v19, v8 v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 buffer_store_short v19, v71, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v74, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v20, v8 v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 buffer_store_short v20, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v78, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 buffer_store_short v21, v79, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v86, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 buffer_store_short v23, v87, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v90, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 buffer_store_short v24, v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v62, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 buffer_store_short v25, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v66, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 buffer_store_short v26, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v70, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 buffer_store_short v27, v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v74, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 buffer_store_short v28, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v78, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 buffer_store_short v29, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 buffer_store_short v30, v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v86, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 buffer_store_short v31, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v90, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v62, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 buffer_store_short v33, v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v66, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 buffer_store_short v34, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v70, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 buffer_store_short v35, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v74, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 buffer_store_short v36, v115, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v78, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 buffer_store_short v37, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 buffer_store_short v38, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v86, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 buffer_store_short v39, v121, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v90, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 buffer_store_short v40, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v62, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 buffer_store_short v41, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v66, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 buffer_store_short v42, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v70, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 buffer_store_short v43, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v74, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 buffer_store_short v44, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v78, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 buffer_store_short v45, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 buffer_store_short v46, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v86, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 buffer_store_short v47, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v90, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 buffer_store_short v48, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v62, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 buffer_store_short v49, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v66, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 buffer_store_short v50, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v70, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 buffer_store_short v51, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v74, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 buffer_store_short v52, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v78, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 buffer_store_short v53, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 buffer_store_short v54, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v86, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_f16_f32 v55, v[vgprValuC+55] // convert C to fp16 buffer_store_short v55, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v90, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_f16_f32 v56, v[vgprValuC+56] // convert C to fp16 buffer_store_short v56, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v62, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v57, v8 v_cvt_f16_f32 v57, v[vgprValuC+57] // convert C to fp16 buffer_store_short v57, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v66, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v58, v8 v_cvt_f16_f32 v58, v[vgprValuC+58] // convert C to fp16 buffer_store_short v58, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v70, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v59, v8 v_cvt_f16_f32 v59, v[vgprValuC+59] // convert C to fp16 buffer_store_short v59, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v74, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v60, v8 v_cvt_f16_f32 v60, v[vgprValuC+60] // convert C to fp16 buffer_store_short v60, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -11418,480 +11376,480 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,16,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v8, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v68, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v68, v8, s74 v_lshlrev_b32 v68, 0x2, v68 // Bias address scaled by BPE ds_read_b32 v65, v68 offset:0 // load Bias ds_read_b32 v66, v68 offset:1024 // load scaleAlpha v_add_lshl_u32 v67, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v67, v16, v67, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v67, v16, v67, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v72, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v72, v8, s74 v_lshlrev_b32 v72, 0x2, v72 // Bias address scaled by BPE ds_read_b32 v69, v72 offset:0 // load Bias ds_read_b32 v70, v72 offset:1024 // load scaleAlpha v_add_lshl_u32 v71, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v71, v16, v71, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v71, v16, v71, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v80, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v80, v4, s74 v_lshlrev_b32 v80, 0x2, v80 // Bias address scaled by BPE ds_read_b32 v77, v80 offset:0 // load Bias ds_read_b32 v78, v80 offset:1024 // load scaleAlpha v_add_lshl_u32 v79, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v79, v16, v79, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v79, v16, v79, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v88, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v88, v8, s74 v_lshlrev_b32 v88, 0x2, v88 // Bias address scaled by BPE ds_read_b32 v85, v88 offset:0 // load Bias ds_read_b32 v86, v88 offset:1024 // load scaleAlpha v_add_lshl_u32 v87, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v87, v16, v87, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v87, v16, v87, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v8, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE ds_read_b32 v89, v92 offset:0 // load Bias ds_read_b32 v90, v92 offset:1024 // load scaleAlpha v_add_lshl_u32 v91, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v91, v16, v91, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v91, v16, v91, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v8, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v8, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v4, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v8, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v103, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v103, v16, v103, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v103, v16, v103, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v8, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v8, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v4, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v122, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v122, v8, s74 v_lshlrev_b32 v122, 0x2, v122 // Bias address scaled by BPE v_add_lshl_u32 v121, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v121, v16, v121, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v121, v16, v121, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v128, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v128, v8, s74 v_lshlrev_b32 v128, 0x2, v128 // Bias address scaled by BPE v_add_lshl_u32 v127, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v127, v16, v127, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v127, v16, v127, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v137, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v137, v4, s74 v_lshlrev_b32 v137, 0x2, v137 // Bias address scaled by BPE v_add_lshl_u32 v136, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v136, v16, v136, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v136, v16, v136, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v143, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v143, v8, s74 v_lshlrev_b32 v143, 0x2, v143 // Bias address scaled by BPE v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v142, v16, v142, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v142, v16, v142, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v149, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v149, v8, s74 v_lshlrev_b32 v149, 0x2, v149 // Bias address scaled by BPE v_add_lshl_u32 v148, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v148, v16, v148, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v148, v16, v148, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v4, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v155, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v155, v8, s74 v_lshlrev_b32 v155, 0x2, v155 // Bias address scaled by BPE v_add_lshl_u32 v154, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v154, v16, v154, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v154, v16, v154, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v161, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v161, v8, s74 v_lshlrev_b32 v161, 0x2, v161 // Bias address scaled by BPE v_add_lshl_u32 v160, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v160, v16, v160, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v160, v16, v160, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v167, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v167, v8, s74 v_lshlrev_b32 v167, 0x2, v167 // Bias address scaled by BPE v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v166, v16, v166, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v166, v16, v166, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc18 // copy acc to vreg[132] v_accvgpr_read_b32 v[vgprValuC+18], acc22 // copy acc to vreg[133] v_accvgpr_read_b32 v[vgprValuC+19], acc26 // copy acc to vreg[134] @@ -11966,265 +11924,265 @@ s_waitcnt lgkmcnt(0) // wait for Bias LDS, ScaleAl /* apply mask, calc new C and issue writes */ v_mul_f32 v[vgprValuC+17], v62, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v17, v8 v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 buffer_store_short v17, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v66, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v18, v8 v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 buffer_store_short v18, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v70, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v19, v8 v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 buffer_store_short v19, v71, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v74, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v20, v8 v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 buffer_store_short v20, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v78, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 buffer_store_short v21, v79, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v86, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 buffer_store_short v23, v87, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v90, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 buffer_store_short v24, v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v62, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 buffer_store_short v25, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v66, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 buffer_store_short v26, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v70, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 buffer_store_short v27, v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v74, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 buffer_store_short v28, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v78, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 buffer_store_short v29, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 buffer_store_short v30, v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v86, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 buffer_store_short v31, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v90, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v62, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 buffer_store_short v33, v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v66, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 buffer_store_short v34, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v70, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 buffer_store_short v35, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v74, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 buffer_store_short v36, v115, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v78, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 buffer_store_short v37, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 buffer_store_short v38, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v86, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 buffer_store_short v39, v121, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v90, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 buffer_store_short v40, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v62, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 buffer_store_short v41, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v66, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 buffer_store_short v42, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v70, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 buffer_store_short v43, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v74, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 buffer_store_short v44, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v78, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 buffer_store_short v45, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 buffer_store_short v46, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v86, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 buffer_store_short v47, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v90, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 buffer_store_short v48, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v62, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 buffer_store_short v49, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v66, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 buffer_store_short v50, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v70, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 buffer_store_short v51, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v74, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 buffer_store_short v52, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v78, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 buffer_store_short v53, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 buffer_store_short v54, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v86, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_f16_f32 v55, v[vgprValuC+55] // convert C to fp16 buffer_store_short v55, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v90, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_f16_f32 v56, v[vgprValuC+56] // convert C to fp16 buffer_store_short v56, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v62, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v57, v8 v_cvt_f16_f32 v57, v[vgprValuC+57] // convert C to fp16 buffer_store_short v57, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v66, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v58, v8 v_cvt_f16_f32 v58, v[vgprValuC+58] // convert C to fp16 buffer_store_short v58, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v70, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v59, v8 v_cvt_f16_f32 v59, v[vgprValuC+59] // convert C to fp16 buffer_store_short v59, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v74, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v60, v8 v_cvt_f16_f32 v60, v[vgprValuC+60] // convert C to fp16 buffer_store_short v60, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -12244,480 +12202,480 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v4, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v68, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v68, v8, s74 v_lshlrev_b32 v68, 0x2, v68 // Bias address scaled by BPE ds_read_b32 v65, v68 offset:0 // load Bias ds_read_b32 v66, v68 offset:1024 // load scaleAlpha v_add_lshl_u32 v67, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v67, v16, v67, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v67, v16, v67, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v72, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v72, v8, s74 v_lshlrev_b32 v72, 0x2, v72 // Bias address scaled by BPE ds_read_b32 v69, v72 offset:0 // load Bias ds_read_b32 v70, v72 offset:1024 // load scaleAlpha v_add_lshl_u32 v71, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v71, v16, v71, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v71, v16, v71, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v80, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v80, v8, s74 v_lshlrev_b32 v80, 0x2, v80 // Bias address scaled by BPE ds_read_b32 v77, v80 offset:0 // load Bias ds_read_b32 v78, v80 offset:1024 // load scaleAlpha v_add_lshl_u32 v79, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v79, v16, v79, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v79, v16, v79, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v88, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v88, v8, s74 v_lshlrev_b32 v88, 0x2, v88 // Bias address scaled by BPE ds_read_b32 v85, v88 offset:0 // load Bias ds_read_b32 v86, v88 offset:1024 // load scaleAlpha v_add_lshl_u32 v87, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v87, v16, v87, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v87, v16, v87, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v8, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE ds_read_b32 v89, v92 offset:0 // load Bias ds_read_b32 v90, v92 offset:1024 // load scaleAlpha v_add_lshl_u32 v91, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v91, v16, v91, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v91, v16, v91, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v8, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v8, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v103, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v103, v16, v103, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v103, v16, v103, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v8, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v122, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v122, v8, s74 v_lshlrev_b32 v122, 0x2, v122 // Bias address scaled by BPE v_add_lshl_u32 v121, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v121, v16, v121, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v121, v16, v121, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v4, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v128, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v128, v8, s74 v_lshlrev_b32 v128, 0x2, v128 // Bias address scaled by BPE v_add_lshl_u32 v127, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v127, v16, v127, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v127, v16, v127, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v137, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v137, v8, s74 v_lshlrev_b32 v137, 0x2, v137 // Bias address scaled by BPE v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v136, v16, v136, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v136, v16, v136, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v143, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v143, v8, s74 v_lshlrev_b32 v143, 0x2, v143 // Bias address scaled by BPE v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v142, v16, v142, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v142, v16, v142, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v4, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v149, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v149, v8, s74 v_lshlrev_b32 v149, 0x2, v149 // Bias address scaled by BPE v_add_lshl_u32 v148, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v148, v16, v148, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v148, v16, v148, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v153, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v153, v8, s74 v_lshlrev_b32 v153, 0x2, v153 // Bias address scaled by BPE v_add_lshl_u32 v152, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v152, v16, v152, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v152, v16, v152, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v155, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v155, v8, s74 v_lshlrev_b32 v155, 0x2, v155 // Bias address scaled by BPE v_add_lshl_u32 v154, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v154, v16, v154, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v154, v16, v154, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v159, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v159, v8, s74 v_lshlrev_b32 v159, 0x2, v159 // Bias address scaled by BPE v_add_lshl_u32 v158, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v158, v16, v158, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v158, v16, v158, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v161, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v161, v4, s74 v_lshlrev_b32 v161, 0x2, v161 // Bias address scaled by BPE v_add_lshl_u32 v160, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v160, v16, v160, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v160, v16, v160, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v165, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v165, v8, s74 v_lshlrev_b32 v165, 0x2, v165 // Bias address scaled by BPE v_add_lshl_u32 v164, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v164, v16, v164, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v164, v16, v164, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v167, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v167, v8, s74 v_lshlrev_b32 v167, 0x2, v167 // Bias address scaled by BPE v_add_lshl_u32 v166, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v166, v16, v166, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v166, v16, v166, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc194 // copy acc to vreg[176] v_accvgpr_read_b32 v[vgprValuC+18], acc198 // copy acc to vreg[177] v_accvgpr_read_b32 v[vgprValuC+19], acc202 // copy acc to vreg[178] @@ -12792,265 +12750,265 @@ s_waitcnt lgkmcnt(0) // wait for Bias LDS, ScaleAl /* apply mask, calc new C and issue writes */ v_mul_f32 v[vgprValuC+17], v62, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v17, v8 v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 buffer_store_short v17, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v66, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v18, v8 v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 buffer_store_short v18, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v70, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v19, v8 v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 buffer_store_short v19, v71, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v74, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v20, v8 v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 buffer_store_short v20, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v78, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 buffer_store_short v21, v79, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v86, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 buffer_store_short v23, v87, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v90, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 buffer_store_short v24, v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v62, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 buffer_store_short v25, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v66, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 buffer_store_short v26, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v70, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 buffer_store_short v27, v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v74, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 buffer_store_short v28, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v78, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 buffer_store_short v29, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 buffer_store_short v30, v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v86, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 buffer_store_short v31, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v90, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v62, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 buffer_store_short v33, v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v66, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 buffer_store_short v34, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v70, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 buffer_store_short v35, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v74, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 buffer_store_short v36, v115, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v78, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 buffer_store_short v37, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 buffer_store_short v38, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v86, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 buffer_store_short v39, v121, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v90, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 buffer_store_short v40, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v62, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 buffer_store_short v41, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v66, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 buffer_store_short v42, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v70, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 buffer_store_short v43, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v74, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 buffer_store_short v44, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v78, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 buffer_store_short v45, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 buffer_store_short v46, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v86, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 buffer_store_short v47, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v90, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 buffer_store_short v48, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v62, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 buffer_store_short v49, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v66, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 buffer_store_short v50, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v70, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 buffer_store_short v51, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v74, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 buffer_store_short v52, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v78, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 buffer_store_short v53, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 buffer_store_short v54, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+55], v86, v[vgprValuC+55] // *= ScaleAlphaVecVMul v_add_f32 v8, v85, v[vgprValuC+55] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v55, v8 v_cvt_f16_f32 v55, v[vgprValuC+55] // convert C to fp16 buffer_store_short v55, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+56], v90, v[vgprValuC+56] // *= ScaleAlphaVecVMul v_add_f32 v8, v89, v[vgprValuC+56] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v56, v8 v_cvt_f16_f32 v56, v[vgprValuC+56] // convert C to fp16 buffer_store_short v56, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+57], v62, v[vgprValuC+57] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+57] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v57, v8 v_cvt_f16_f32 v57, v[vgprValuC+57] // convert C to fp16 buffer_store_short v57, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+58], v66, v[vgprValuC+58] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+58] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v58, v8 v_cvt_f16_f32 v58, v[vgprValuC+58] // convert C to fp16 buffer_store_short v58, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+59], v70, v[vgprValuC+59] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+59] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v59, v8 v_cvt_f16_f32 v59, v[vgprValuC+59] // convert C to fp16 buffer_store_short v59, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+60], v74, v[vgprValuC+60] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+60] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v60, v8 v_cvt_f16_f32 v60, v[vgprValuC+60] // convert C to fp16 buffer_store_short v60, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13066,396 +13024,396 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,27,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v56, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v56, v8, s74 v_lshlrev_b32 v56, 0x2, v56 // Bias address scaled by BPE ds_read_b32 v53, v56 offset:0 // load Bias ds_read_b32 v54, v56 offset:1024 // load scaleAlpha v_add_lshl_u32 v55, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v55, v16, v55, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v55, v16, v55, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v60, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v60, v8, s74 v_lshlrev_b32 v60, 0x2, v60 // Bias address scaled by BPE ds_read_b32 v57, v60 offset:0 // load Bias ds_read_b32 v58, v60 offset:1024 // load scaleAlpha v_add_lshl_u32 v59, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v59, v16, v59, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v59, v16, v59, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v8, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v68, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v68, v8, s74 v_lshlrev_b32 v68, 0x2, v68 // Bias address scaled by BPE ds_read_b32 v65, v68 offset:0 // load Bias ds_read_b32 v66, v68 offset:1024 // load scaleAlpha v_add_lshl_u32 v67, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v67, v16, v67, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v67, v16, v67, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v72, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v72, v4, s74 v_lshlrev_b32 v72, 0x2, v72 // Bias address scaled by BPE ds_read_b32 v69, v72 offset:0 // load Bias ds_read_b32 v70, v72 offset:1024 // load scaleAlpha v_add_lshl_u32 v71, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v71, v16, v71, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v71, v16, v71, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v76, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v76, v8, s74 v_lshlrev_b32 v76, 0x2, v76 // Bias address scaled by BPE ds_read_b32 v73, v76 offset:0 // load Bias ds_read_b32 v74, v76 offset:1024 // load scaleAlpha v_add_lshl_u32 v75, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v75, v16, v75, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v75, v16, v75, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v80, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v80, v8, s74 v_lshlrev_b32 v80, 0x2, v80 // Bias address scaled by BPE ds_read_b32 v77, v80 offset:0 // load Bias ds_read_b32 v78, v80 offset:1024 // load scaleAlpha v_add_lshl_u32 v79, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v79, v16, v79, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v79, v16, v79, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v86, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v86, v8, s74 v_lshlrev_b32 v86, 0x2, v86 // Bias address scaled by BPE v_add_lshl_u32 v85, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v85, v16, v85, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v85, v16, v85, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v88, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v88, v8, s74 v_lshlrev_b32 v88, 0x2, v88 // Bias address scaled by BPE v_add_lshl_u32 v87, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v87, v16, v87, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v87, v16, v87, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v90, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v90, v8, s74 v_lshlrev_b32 v90, 0x2, v90 // Bias address scaled by BPE v_add_lshl_u32 v89, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v89, v16, v89, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v89, v16, v89, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v8, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v91, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v91, v16, v91, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v91, v16, v91, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v98, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v98, v8, s74 v_lshlrev_b32 v98, 0x2, v98 // Bias address scaled by BPE v_add_lshl_u32 v97, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v97, v16, v97, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v97, v16, v97, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v8, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v103, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v103, v16, v103, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v103, v16, v103, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v110, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v110, v4, s74 v_lshlrev_b32 v110, 0x2, v110 // Bias address scaled by BPE v_add_lshl_u32 v109, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v109, v16, v109, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v109, v16, v109, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v8, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v115, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v115, v16, v115, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v115, v16, v115, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v122, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v122, v8, s74 v_lshlrev_b32 v122, 0x2, v122 // Bias address scaled by BPE v_add_lshl_u32 v121, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v121, v16, v121, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v121, v16, v121, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v4, s82 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v4, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v128, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v128, v8, s74 v_lshlrev_b32 v128, 0x2, v128 // Bias address scaled by BPE v_add_lshl_u32 v127, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v127, v16, v127, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v127, v16, v127, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v137, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v137, v8, s74 v_lshlrev_b32 v137, 0x2, v137 // Bias address scaled by BPE v_add_lshl_u32 v136, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v136, v16, v136, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v136, v16, v136, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v143, v8, s82 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v143, v8, s74 v_lshlrev_b32 v143, 0x2, v143 // Bias address scaled by BPE v_add_lshl_u32 v142, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v142, v16, v142, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v142, v16, v142, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc115 // copy acc to vreg[220] v_accvgpr_read_b32 v[vgprValuC+18], acc119 // copy acc to vreg[221] v_accvgpr_read_b32 v[vgprValuC+19], acc123 // copy acc to vreg[222] @@ -13518,234 +13476,234 @@ s_waitcnt lgkmcnt(0) // wait for Bias LDS, ScaleAl /* apply mask, calc new C and issue writes */ v_mul_f32 v[vgprValuC+17], v54, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_add_f32 v8, v53, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v17, v8 v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 buffer_store_short v17, v55, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v58, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_add_f32 v8, v57, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v18, v8 v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 buffer_store_short v18, v59, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v62, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v19, v8 v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 buffer_store_short v19, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v66, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v20, v8 v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 buffer_store_short v20, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v70, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 buffer_store_short v21, v71, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v74, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 buffer_store_short v22, v75, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v78, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 buffer_store_short v23, v79, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v82, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 buffer_store_short v24, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v54, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_add_f32 v8, v53, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 buffer_store_short v25, v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v58, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_add_f32 v8, v57, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 buffer_store_short v26, v87, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v62, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 buffer_store_short v27, v89, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v66, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 buffer_store_short v28, v91, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v70, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 buffer_store_short v29, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v74, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 buffer_store_short v30, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v78, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 buffer_store_short v31, v97, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v82, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 buffer_store_short v32, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v54, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_add_f32 v8, v53, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 buffer_store_short v33, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v58, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_add_f32 v8, v57, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 buffer_store_short v34, v103, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v62, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 buffer_store_short v35, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v66, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 buffer_store_short v36, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v70, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 buffer_store_short v37, v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v74, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 buffer_store_short v38, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v78, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 buffer_store_short v39, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v82, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 buffer_store_short v40, v115, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v54, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_add_f32 v8, v53, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 buffer_store_short v41, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v58, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_add_f32 v8, v57, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 buffer_store_short v42, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v62, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 buffer_store_short v43, v121, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v66, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 buffer_store_short v44, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v70, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_add_f32 v8, v69, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 buffer_store_short v45, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v74, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_add_f32 v8, v73, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 buffer_store_short v46, v127, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v78, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_add_f32 v8, v77, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 buffer_store_short v47, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v82, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_add_f32 v8, v81, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 buffer_store_short v48, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v54, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_add_f32 v8, v53, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 buffer_store_short v49, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v58, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_add_f32 v8, v57, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 buffer_store_short v50, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v62, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_add_f32 v8, v61, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 buffer_store_short v51, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v66, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_add_f32 v8, v65, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 buffer_store_short v52, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst s_branch label_GW_End // jump to end label_GW_Beta: -s_and_b32 s82, 255, s[sgprSizeI] // s82 = s[sgprSizeI] % 256 -s_add_u32 s83, -0x1, s[sgprNumWorkGroups0] -s_cmp_ge_u32 s[sgprWorkGroup0], s83 // wg0 >= nwg0-1 ? -s_cselect_b32 s82, s82, 0 // set rMT0 -s_cmpk_gt_u32 s82, 0 // rMT0 > 0 +s_and_b32 s74, 255, s[sgprSizeI] // s74 = s[sgprSizeI] % 256 +s_add_u32 s75, -0x1, s[sgprNumWorkGroups0] +s_cmp_ge_u32 s[sgprWorkGroup0], s75 // wg0 >= nwg0-1 ? +s_cselect_b32 s74, s74, 0 // set rMT0 +s_cmpk_gt_u32 s74, 0 // rMT0 > 0 s_cbranch_scc1 label_GW_B1_E1_M // jump if edges required -s_and_b32 s82, 255, s[sgprSizeJ] // s82 = s[sgprSizeJ] % 256 -s_add_u32 s83, -0x1, s[sgprNumWorkGroups1] -s_cmp_ge_u32 s[sgprWorkGroup1], s83 // wg1 >= nwg1-1 -s_cselect_b32 s82, s82, 0 // set rMT1 -s_cmpk_gt_u32 s82, 0 // rMT1 > 0 +s_and_b32 s74, 255, s[sgprSizeJ] // s74 = s[sgprSizeJ] % 256 +s_add_u32 s75, -0x1, s[sgprNumWorkGroups1] +s_cmp_ge_u32 s[sgprWorkGroup1], s75 // wg1 >= nwg1-1 +s_cselect_b32 s74, s74, 0 // set rMT1 +s_cmpk_gt_u32 s74, 0 // rMT1 > 0 s_cbranch_scc1 label_GW_B1_E1_N // jump if edges required label_GW_B1_E0: s_cmpk_eq_u32 s[sgprActivationType], 3 // activationType == 3 @@ -13754,28 +13712,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_1_edge_0 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_1_edge_0 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_1_edge_0 // Branch if true label_To_Activation_None_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_To_Activation_Gelu_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_To_Activation_Relu_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_To_Activation_Silu_VW8_beta_1_edge_0: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_2 +label_To_Activation_Clamp_VW8_beta_1_edge_0: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s73, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_2 label_ActivationSetPCAddrEnd_2: @@ -13792,8 +13758,8 @@ label_ActivationSetPCAddrEnd_2: /* (d1,vc1,d0,vc0)=(0,0,0,0) */ v_add_lshl_u32 v18, v6, v4, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=4, coord0Vgpr=4 buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v19, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v19, v4, s74 v_lshlrev_b32 v19, 0x2, v19 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -13802,28 +13768,28 @@ ds_read_b128 v[76:79], v19 offset:16 // load Bias ds_read_b128 v[80:83], v19 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,1,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[88:91], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,2,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[92:95], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,3,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,4,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,5,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_add_lshl_u32 v17, v7, v4, 0x1 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=4, coord0Vgpr=4 @@ -13921,7 +13887,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -13957,7 +13923,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -13974,8 +13940,8 @@ v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -13996,7 +13962,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -14013,8 +13979,8 @@ v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14035,7 +14001,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -14052,8 +14018,8 @@ v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14074,7 +14040,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -14091,8 +14057,8 @@ v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14113,7 +14079,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -14130,8 +14096,8 @@ v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -14144,8 +14110,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,6,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[72:75], v19 offset:0 // load Bias @@ -14153,28 +14119,28 @@ ds_read_b128 v[76:79], v19 offset:16 // load Bias ds_read_b128 v[80:83], v19 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,7,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[88:91], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,8,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[92:95], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,9,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,10,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,11,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc192 // copy acc to vreg[48] @@ -14271,7 +14237,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -14288,8 +14254,8 @@ v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14310,7 +14276,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -14327,8 +14293,8 @@ v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14349,7 +14315,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -14366,8 +14332,8 @@ v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14388,7 +14354,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -14405,8 +14371,8 @@ v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14427,7 +14393,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -14444,8 +14410,8 @@ v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14466,7 +14432,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -14483,8 +14449,8 @@ v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -14497,8 +14463,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,12,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[72:75], v19 offset:0 // load Bias @@ -14506,28 +14472,28 @@ ds_read_b128 v[76:79], v19 offset:16 // load Bias ds_read_b128 v[80:83], v19 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,13,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[88:91], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,14,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[92:95], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,15,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,16,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,17,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc129 // copy acc to vreg[96] @@ -14624,7 +14590,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -14641,8 +14607,8 @@ v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14663,7 +14629,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -14680,8 +14646,8 @@ v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14702,7 +14668,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -14719,8 +14685,8 @@ v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14741,7 +14707,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -14758,8 +14724,8 @@ v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14780,7 +14746,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -14797,8 +14763,8 @@ v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -14819,7 +14785,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -14836,8 +14802,8 @@ v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -14850,8 +14816,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,18,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[72:75], v19 offset:0 // load Bias @@ -14859,28 +14825,28 @@ ds_read_b128 v[76:79], v19 offset:16 // load Bias ds_read_b128 v[80:83], v19 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,19,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[88:91], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,20,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[92:95], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,21,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,22,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,23,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc66 // copy acc to vreg[144] @@ -14977,7 +14943,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -14994,8 +14960,8 @@ v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15016,7 +14982,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -15033,8 +14999,8 @@ v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15055,7 +15021,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -15072,8 +15038,8 @@ v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15094,7 +15060,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -15111,8 +15077,8 @@ v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15133,7 +15099,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -15150,8 +15116,8 @@ v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15172,7 +15138,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -15189,8 +15155,8 @@ v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -15203,8 +15169,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,24,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[72:75], v19 offset:0 // load Bias @@ -15212,28 +15178,28 @@ ds_read_b128 v[76:79], v19 offset:16 // load Bias ds_read_b128 v[80:83], v19 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,25,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[88:91], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,26,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[92:95], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,27,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,28,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C /* (d1,vc1,d0,vc0)=(0,29,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc3 // copy acc to vreg[192] @@ -15330,7 +15296,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -15347,8 +15313,8 @@ v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15369,7 +15335,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -15386,8 +15352,8 @@ v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15408,7 +15374,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -15425,8 +15391,8 @@ v_pack_b32_f16 v42, v[vgprValuC+44], v[vgprValuC+45] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+46], v[vgprValuC+46] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+47], v[vgprValuC+47] // convert C to fp16 v_pack_b32_f16 v43, v[vgprValuC+46], v[vgprValuC+47] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15447,7 +15413,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -15464,8 +15430,8 @@ v_pack_b32_f16 v50, v[vgprValuC+52], v[vgprValuC+53] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+54], v[vgprValuC+54] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+55], v[vgprValuC+55] // convert C to fp16 v_pack_b32_f16 v51, v[vgprValuC+54], v[vgprValuC+55] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15486,7 +15452,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -15503,8 +15469,8 @@ v_pack_b32_f16 v58, v[vgprValuC+60], v[vgprValuC+61] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+62], v[vgprValuC+62] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+63], v[vgprValuC+63] // convert C to fp16 v_pack_b32_f16 v59, v[vgprValuC+62], v[vgprValuC+63] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15525,7 +15491,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -15542,8 +15508,8 @@ v_pack_b32_f16 v66, v[vgprValuC+68], v[vgprValuC+69] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+70], v[vgprValuC+70] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+71], v[vgprValuC+71] // convert C to fp16 v_pack_b32_f16 v67, v[vgprValuC+70], v[vgprValuC+71] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -15556,8 +15522,8 @@ s_nop 0 // 1 wait state required when /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,30,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C ds_read_b128 v[40:43], v19 offset:0 // load Bias @@ -15565,8 +15531,8 @@ ds_read_b128 v[44:47], v19 offset:16 // load Bias ds_read_b128 v[48:51], v19 offset:1024 // load scaleAlpha ds_read_b128 v[52:55], v19 offset:1040 // load scaleAlpha /* (d1,vc1,d0,vc0)=(0,31,0,0) */ -s_lshl_b32 s82, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideC1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[56:59], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C v_accvgpr_read_b32 v[vgprValuC+24], acc195 // copy acc to vreg[240] @@ -15615,7 +15581,7 @@ v_pk_add_f32 v[8:9], v[40:41], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[42:43], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[44:45], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[46:47], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -15632,8 +15598,8 @@ v_pack_b32_f16 v26, v[vgprValuC+28], v[vgprValuC+29] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+30], v[vgprValuC+30] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+31], v[vgprValuC+31] // convert C to fp16 v_pack_b32_f16 v27, v[vgprValuC+30], v[vgprValuC+31] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -15654,7 +15620,7 @@ v_pk_add_f32 v[8:9], v[40:41], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[42:43], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[44:45], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[46:47], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -15671,8 +15637,8 @@ v_pack_b32_f16 v34, v[vgprValuC+36], v[vgprValuC+37] // Pack with neighbor v_cvt_f16_f32 v[vgprValuC+38], v[vgprValuC+38] // convert C to fp16 v_cvt_f16_f32 v[vgprValuC+39], v[vgprValuC+39] // convert C to fp16 v_pack_b32_f16 v35, v[vgprValuC+38], v[vgprValuC+39] // Pack with neighbor -s_lshl_b32 s82, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s82 // incToNextRow: gra SRD += inc(lower) +s_lshl_b32 s74, s[sgprStrideD1J], 1 // incToNextRow: Scale by BPE +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s74 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -15684,28 +15650,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_1_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_1_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_1_edge_1 // Branch if true label_To_Activation_None_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_None_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_None_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_To_Activation_Gelu_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Gelu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Gelu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_To_Activation_Relu_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Relu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Relu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_To_Activation_Silu_VW8_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Silu_VW8, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Silu_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd_1 +label_To_Activation_Clamp_VW8_beta_1_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s73, label_Activation_Clamp_VW8, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd_1 label_ActivationSetPCAddrEnd_1: @@ -15721,14 +15695,14 @@ label_ActivationSetPCAddrEnd_1: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v17, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v4, s74 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier @@ -15737,92 +15711,92 @@ ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v19, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v4, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v93, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v95, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v4, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v105, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v107, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v4, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+25], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+26], acc8 // copy acc to vreg[2] @@ -15916,7 +15890,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -15950,7 +15924,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -15984,7 +15958,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -16018,7 +15992,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -16052,7 +16026,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -16086,7 +16060,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -16120,106 +16094,106 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v17, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v4, s74 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[72:75], v18 offset:0 // load Bias ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v19, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v4, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v93, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v95, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v4, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v105, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v107, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v4, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc192 // copy acc to vreg[48] v_accvgpr_read_b32 v[vgprValuC+25], acc196 // copy acc to vreg[49] v_accvgpr_read_b32 v[vgprValuC+26], acc200 // copy acc to vreg[50] @@ -16313,7 +16287,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -16347,7 +16321,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -16381,7 +16355,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -16415,7 +16389,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -16449,7 +16423,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -16483,7 +16457,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -16517,106 +16491,106 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v17, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v4, s74 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[72:75], v18 offset:0 // load Bias ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v19, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v4, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v93, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v95, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v4, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v105, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v107, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v4, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc129 // copy acc to vreg[96] v_accvgpr_read_b32 v[vgprValuC+25], acc133 // copy acc to vreg[97] v_accvgpr_read_b32 v[vgprValuC+26], acc137 // copy acc to vreg[98] @@ -16710,7 +16684,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -16744,7 +16718,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -16778,7 +16752,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -16812,7 +16786,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -16846,7 +16820,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -16880,7 +16854,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -16914,106 +16888,106 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v17, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v4, s74 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[72:75], v18 offset:0 // load Bias ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v19, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v4, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v93, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v95, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v4, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v105, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v107, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v4, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc66 // copy acc to vreg[144] v_accvgpr_read_b32 v[vgprValuC+25], acc70 // copy acc to vreg[145] v_accvgpr_read_b32 v[vgprValuC+26], acc74 // copy acc to vreg[146] @@ -17107,7 +17081,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -17141,7 +17115,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -17175,7 +17149,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -17209,7 +17183,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -17243,7 +17217,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -17277,7 +17251,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -17311,106 +17285,106 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v17, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v4, s74 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[72:75], v18 offset:0 // load Bias ds_read_b128 v[76:79], v18 offset:16 // load Bias ds_read_b128 v[80:83], v18 offset:1024 // load scaleAlpha ds_read_b128 v[84:87], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v19, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[88:91], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v92, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v92, v4, s74 v_lshlrev_b32 v92, 0x2, v92 // Bias address scaled by BPE v_add_lshl_u32 v19, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v93, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[96:99], v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v4, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE v_add_lshl_u32 v93, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v95, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[100:103], v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v104, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v104, v4, s74 v_lshlrev_b32 v104, 0x2, v104 // Bias address scaled by BPE v_add_lshl_u32 v95, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v105, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[108:111], v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v4, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v107, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[112:115], v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v116, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v116, v4, s74 v_lshlrev_b32 v116, 0x2, v116 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc3 // copy acc to vreg[192] v_accvgpr_read_b32 v[vgprValuC+25], acc7 // copy acc to vreg[193] v_accvgpr_read_b32 v[vgprValuC+26], acc11 // copy acc to vreg[194] @@ -17504,7 +17478,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -17538,7 +17512,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -17572,7 +17546,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+40:vgprValuC+40+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+42:vgprValuC+42+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+44:vgprValuC+44+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+46:vgprValuC+46+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[40:41], v[8:9] v_mov_b64 v[42:43], v[10:11] v_mov_b64 v[44:45], v[12:13] @@ -17606,7 +17580,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+48:vgprValuC+48+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+50:vgprValuC+50+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+52:vgprValuC+52+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+54:vgprValuC+54+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[48:49], v[8:9] v_mov_b64 v[50:51], v[10:11] v_mov_b64 v[52:53], v[12:13] @@ -17640,7 +17614,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+56:vgprValuC+56+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+58:vgprValuC+58+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+60:vgprValuC+60+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+62:vgprValuC+62+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[56:57], v[8:9] v_mov_b64 v[58:59], v[10:11] v_mov_b64 v[60:61], v[12:13] @@ -17674,7 +17648,7 @@ v_pk_add_f32 v[8:9], v[72:73], v[vgprValuC+64:vgprValuC+64+1] // C += bias v_pk_add_f32 v[10:11], v[74:75], v[vgprValuC+66:vgprValuC+66+1] // C += bias v_pk_add_f32 v[12:13], v[76:77], v[vgprValuC+68:vgprValuC+68+1] // C += bias v_pk_add_f32 v[14:15], v[78:79], v[vgprValuC+70:vgprValuC+70+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[64:65], v[8:9] v_mov_b64 v[66:67], v[10:11] v_mov_b64 v[68:69], v[12:13] @@ -17708,38 +17682,38 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v17, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[20:23], v17, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v18, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v18, v4, s74 v_lshlrev_b32 v18, 0x2, v18 // Bias address scaled by BPE ds_read_b128 v[40:43], v18 offset:0 // load Bias ds_read_b128 v[44:47], v18 offset:16 // load Bias ds_read_b128 v[48:51], v18 offset:1024 // load scaleAlpha ds_read_b128 v[52:55], v18 offset:1040 // load scaleAlpha v_add_lshl_u32 v17, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v17, v16, v17, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v17, v16, v17, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v19, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDC clip if OOB. offset buffer_load_dwordx4 v[56:59], v19, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v60, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v60, v4, s74 v_lshlrev_b32 v60, 0x2, v60 // Bias address scaled by BPE v_add_lshl_u32 v19, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v19, v16, v19, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v19, v16, v19, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+24], acc195 // copy acc to vreg[240] v_accvgpr_read_b32 v[vgprValuC+25], acc199 // copy acc to vreg[241] v_accvgpr_read_b32 v[vgprValuC+26], acc203 // copy acc to vreg[242] @@ -17785,7 +17759,7 @@ v_pk_add_f32 v[8:9], v[40:41], v[vgprValuC+24:vgprValuC+24+1] // C += bias v_pk_add_f32 v[10:11], v[42:43], v[vgprValuC+26:vgprValuC+26+1] // C += bias v_pk_add_f32 v[12:13], v[44:45], v[vgprValuC+28:vgprValuC+28+1] // C += bias v_pk_add_f32 v[14:15], v[46:47], v[vgprValuC+30:vgprValuC+30+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[24:25], v[8:9] v_mov_b64 v[26:27], v[10:11] v_mov_b64 v[28:29], v[12:13] @@ -17819,7 +17793,7 @@ v_pk_add_f32 v[8:9], v[40:41], v[vgprValuC+32:vgprValuC+32+1] // C += bias v_pk_add_f32 v[10:11], v[42:43], v[vgprValuC+34:vgprValuC+34+1] // C += bias v_pk_add_f32 v[12:13], v[44:45], v[vgprValuC+36:vgprValuC+36+1] // C += bias v_pk_add_f32 v[14:15], v[46:47], v[vgprValuC+38:vgprValuC+38+1] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b64 v[32:33], v[8:9] v_mov_b64 v[34:35], v[10:11] v_mov_b64 v[36:37], v[12:13] @@ -17846,28 +17820,36 @@ s_cmpk_eq_u32 s[sgprActivationType], 5 // activationType == 5 s_cbranch_scc1 label_To_Activation_Relu_VW1_beta_1_edge_1 // Branch if true s_cmpk_eq_u32 s[sgprActivationType], 10 // activationType == 10 s_cbranch_scc1 label_To_Activation_Silu_VW1_beta_1_edge_1 // Branch if true +s_cmpk_eq_u32 s[sgprActivationType], 12 // activationType == 12 +s_cbranch_scc1 label_To_Activation_Clamp_VW1_beta_1_edge_1 // Branch if true label_To_Activation_None_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_None_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_None_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_To_Activation_Gelu_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Gelu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Gelu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_To_Activation_Relu_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Relu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Relu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_To_Activation_Silu_VW1_beta_1_edge_1: s_getpc_b64 s[8:9] // addr of next instr -s_add_i32 s67, label_Activation_Silu_VW1, 4 // target branch offset -s_add_u32 s8, s8, s67 // add target branch offset +s_add_i32 s73, label_Activation_Silu_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset +s_addc_u32 s9, s9, 0 // add high and carry +s_branch label_ActivationSetPCAddrEnd +label_To_Activation_Clamp_VW1_beta_1_edge_1: +s_getpc_b64 s[8:9] // addr of next instr +s_add_i32 s73, label_Activation_Clamp_VW1, 4 // target branch offset +s_add_u32 s8, s8, s73 // add target branch offset s_addc_u32 s9, s9, 0 // add high and carry s_branch label_ActivationSetPCAddrEnd label_ActivationSetPCAddrEnd: @@ -17883,532 +17865,532 @@ label_ActivationSetPCAddrEnd: /* calc coords, apply mask, and issue loads (if necessary) */ v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v58, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s74 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE s_waitcnt lgkmcnt(0) // Wait for LDS write s_barrier // LDS write barrier ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v63, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v8, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v68, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v8, s74 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v73, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v8, s74 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v78, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v8, s74 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v83, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v88, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v16, v88, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v88, v16, v88, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v8, s74 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v16, v88, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v88, v16, v88, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,0,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v93, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v8, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v96, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v16, v96, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v96, v16, v96, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s74 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v16, v96, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v96, v16, v96, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v99, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v102, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v16, v102, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v102, v16, v102, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v8, s74 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v16, v102, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v102, v16, v102, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v105, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v108, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v16, v108, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v108, v16, v108, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v8, s74 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v16, v108, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v108, v16, v108, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v111, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v114, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v16, v114, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v114, v16, v114, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v8, s74 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v16, v114, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v114, v16, v114, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,1,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v117, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v120, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v16, v120, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v120, v16, v120, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s74 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v16, v120, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v120, v16, v120, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v123, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v126, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v16, v126, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v126, v16, v126, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v8, s74 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v16, v126, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v126, v16, v126, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v129, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v135, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v16, v135, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v135, v16, v135, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v8, s74 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v16, v135, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v135, v16, v135, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v138, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v141, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v16, v141, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v141, v16, v141, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v8, s74 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v16, v141, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v141, v16, v141, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,2,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v144, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v147, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v16, v147, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v147, v16, v147, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s74 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v16, v147, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v147, v16, v147, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v150, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v153, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v16, v153, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v153, v16, v153, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v8, s74 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v16, v153, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v153, v16, v153, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v156, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v159, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v16, v159, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v159, v16, v159, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v8, s74 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v16, v159, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v159, v16, v159, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v162, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v165, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v16, v165, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v165, v16, v165, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v8, s74 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v16, v165, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v165, v16, v165, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,3,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v168, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v8, s74 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v171, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v16, v171, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v171, v16, v171, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v4, s74 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v16, v171, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v171, v16, v171, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v174, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v16, v174, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v174, v16, v174, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v8, s74 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v16, v174, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v174, v16, v174, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v177, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v16, v177, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v177, v16, v177, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v8, s74 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v16, v177, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v177, v16, v177, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v180, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v16, v180, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v180, v16, v180, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v8, s74 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v16, v180, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v180, v16, v180, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v183, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v16, v183, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v183, v16, v183, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v8, s74 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v16, v183, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v183, v16, v183, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v186, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v16, v186, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v186, v16, v186, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v8, s74 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v16, v186, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v186, v16, v186, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc0 // copy acc to vreg[0] v_accvgpr_read_b32 v[vgprValuC+18], acc4 // copy acc to vreg[1] v_accvgpr_read_b32 v[vgprValuC+19], acc8 // copy acc to vreg[2] @@ -18475,266 +18457,266 @@ s_waitcnt 0 // wait for Beta, Bias LDS, S v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v55, v[vgprValuC+17] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v17, v8 v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v60, v[vgprValuC+18] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v18, v8 v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v65, v[vgprValuC+19] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v19, v8 v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v70, v[vgprValuC+20] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v20, v8 v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v75, v[vgprValuC+21] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v80, v[vgprValuC+22] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v85, v[vgprValuC+23] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v90, v[vgprValuC+24] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v95, v[vgprValuC+25] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v98, v[vgprValuC+26] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v101, v[vgprValuC+27] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v104, v[vgprValuC+28] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v107, v[vgprValuC+29] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v110, v[vgprValuC+30] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v113, v[vgprValuC+31] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v116, v[vgprValuC+32] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v119, v[vgprValuC+33] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v122, v[vgprValuC+34] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v125, v[vgprValuC+35] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v128, v[vgprValuC+36] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v131, v[vgprValuC+37] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v137, v[vgprValuC+38] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v140, v[vgprValuC+39] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v143, v[vgprValuC+40] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v146, v[vgprValuC+41] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v149, v[vgprValuC+42] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v152, v[vgprValuC+43] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v155, v[vgprValuC+44] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+45], s[sgprBeta], v158, v[vgprValuC+45] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+46], s[sgprBeta], v161, v[vgprValuC+46] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+47], s[sgprBeta], v164, v[vgprValuC+47] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+48], s[sgprBeta], v167, v[vgprValuC+48] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+49], s[sgprBeta], v170, v[vgprValuC+49] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+50], s[sgprBeta], v173, v[vgprValuC+50] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+51], s[sgprBeta], v176, v[vgprValuC+51] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+52], s[sgprBeta], v179, v[vgprValuC+52] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+53], s[sgprBeta], v182, v[vgprValuC+53] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+54], s[sgprBeta], v185, v[vgprValuC+54] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -18750,534 +18732,534 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,4,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v58, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v8, s74 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,4,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v63, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v8, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v68, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s74 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v73, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v8, s74 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v78, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v8, s74 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v83, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v88, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v16, v88, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v88, v16, v88, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v8, s74 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v16, v88, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v88, v16, v88, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v93, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v8, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v16, v96, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v96, v16, v96, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v8, s74 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v16, v96, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v96, v16, v96, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,5,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v99, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v102, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v16, v102, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v102, v16, v102, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s74 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v16, v102, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v102, v16, v102, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v105, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v108, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v16, v108, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v108, v16, v108, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v8, s74 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v16, v108, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v108, v16, v108, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v111, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v114, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v16, v114, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v114, v16, v114, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v8, s74 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v16, v114, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v114, v16, v114, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v117, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v120, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v16, v120, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v120, v16, v120, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v8, s74 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v16, v120, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v120, v16, v120, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,6,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v123, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v126, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v16, v126, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v126, v16, v126, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s74 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v16, v126, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v126, v16, v126, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v129, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v135, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v16, v135, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v135, v16, v135, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v8, s74 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v16, v135, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v135, v16, v135, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v138, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v141, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v16, v141, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v141, v16, v141, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v8, s74 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v16, v141, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v141, v16, v141, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v144, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v147, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v16, v147, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v147, v16, v147, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v8, s74 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v16, v147, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v147, v16, v147, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,7,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v150, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v153, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v16, v153, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v153, v16, v153, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s74 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v16, v153, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v153, v16, v153, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v156, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v159, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v16, v159, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v159, v16, v159, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v8, s74 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v16, v159, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v159, v16, v159, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v162, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v165, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v16, v165, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v165, v16, v165, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v8, s74 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v16, v165, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v165, v16, v165, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v168, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v8, s74 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v171, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v16, v171, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v171, v16, v171, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v8, s74 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v16, v171, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v171, v16, v171, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,8,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v174, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v16, v174, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v174, v16, v174, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v8, s74 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v16, v174, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v174, v16, v174, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v177, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v16, v177, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v177, v16, v177, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v4, s74 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v16, v177, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v177, v16, v177, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v180, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v16, v180, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v180, v16, v180, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v8, s74 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v16, v180, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v180, v16, v180, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v183, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v16, v183, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v183, v16, v183, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v8, s74 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v16, v183, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v183, v16, v183, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v186, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v16, v186, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v186, v16, v186, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v8, s74 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v16, v186, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v186, v16, v186, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc152 // copy acc to vreg[38] v_accvgpr_read_b32 v[vgprValuC+18], acc156 // copy acc to vreg[39] v_accvgpr_read_b32 v[vgprValuC+19], acc160 // copy acc to vreg[40] @@ -19344,266 +19326,266 @@ s_waitcnt 0 // wait for Beta, Bias LDS, S v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v55, v[vgprValuC+17] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v17, v8 v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v60, v[vgprValuC+18] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v18, v8 v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v65, v[vgprValuC+19] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v19, v8 v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v70, v[vgprValuC+20] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v20, v8 v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v75, v[vgprValuC+21] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v80, v[vgprValuC+22] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v85, v[vgprValuC+23] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v90, v[vgprValuC+24] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v95, v[vgprValuC+25] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v98, v[vgprValuC+26] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v101, v[vgprValuC+27] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v104, v[vgprValuC+28] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v107, v[vgprValuC+29] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v110, v[vgprValuC+30] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v113, v[vgprValuC+31] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v116, v[vgprValuC+32] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v119, v[vgprValuC+33] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v122, v[vgprValuC+34] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v125, v[vgprValuC+35] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v128, v[vgprValuC+36] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v131, v[vgprValuC+37] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v137, v[vgprValuC+38] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v140, v[vgprValuC+39] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v143, v[vgprValuC+40] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v146, v[vgprValuC+41] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v149, v[vgprValuC+42] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v152, v[vgprValuC+43] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v155, v[vgprValuC+44] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+45], s[sgprBeta], v158, v[vgprValuC+45] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+46], s[sgprBeta], v161, v[vgprValuC+46] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+47], s[sgprBeta], v164, v[vgprValuC+47] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+48], s[sgprBeta], v167, v[vgprValuC+48] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+49], s[sgprBeta], v170, v[vgprValuC+49] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+50], s[sgprBeta], v173, v[vgprValuC+50] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+51], s[sgprBeta], v176, v[vgprValuC+51] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+52], s[sgprBeta], v179, v[vgprValuC+52] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+53], s[sgprBeta], v182, v[vgprValuC+53] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+54], s[sgprBeta], v185, v[vgprValuC+54] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -19619,534 +19601,534 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,9,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v58, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v8, s74 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v63, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v8, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v68, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v8, s74 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,9,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v73, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v8, s74 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v78, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v4, s74 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v83, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v88, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v16, v88, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v88, v16, v88, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v8, s74 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v16, v88, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v88, v16, v88, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v93, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v8, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v16, v96, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v96, v16, v96, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v8, s74 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v16, v96, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v96, v16, v96, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v99, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v102, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v16, v102, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v102, v16, v102, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v8, s74 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v16, v102, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v102, v16, v102, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,10,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v105, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v108, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v16, v108, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v108, v16, v108, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v4, s74 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v16, v108, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v108, v16, v108, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v111, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v114, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v16, v114, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v114, v16, v114, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v8, s74 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v16, v114, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v114, v16, v114, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v117, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v120, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v16, v120, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v120, v16, v120, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v8, s74 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v16, v120, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v120, v16, v120, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v123, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v126, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v16, v126, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v126, v16, v126, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v8, s74 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v16, v126, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v126, v16, v126, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,11,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v129, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v135, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v16, v135, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v135, v16, v135, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v4, s74 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v16, v135, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v135, v16, v135, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v138, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v141, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v16, v141, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v141, v16, v141, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v8, s74 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v16, v141, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v141, v16, v141, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v144, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v147, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v16, v147, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v147, v16, v147, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v8, s74 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v16, v147, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v147, v16, v147, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v150, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v153, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v16, v153, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v153, v16, v153, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v8, s74 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v16, v153, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v153, v16, v153, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,12,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v156, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v159, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v16, v159, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v159, v16, v159, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v4, s74 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v16, v159, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v159, v16, v159, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v162, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v165, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v16, v165, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v165, v16, v165, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v8, s74 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v16, v165, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v165, v16, v165, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v168, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v8, s74 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v171, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v16, v171, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v171, v16, v171, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v8, s74 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v16, v171, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v171, v16, v171, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v174, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v16, v174, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v174, v16, v174, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v8, s74 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v16, v174, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v174, v16, v174, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v177, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v16, v177, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v177, v16, v177, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v8, s74 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v16, v177, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v177, v16, v177, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,13,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v180, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v16, v180, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v180, v16, v180, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v8, s74 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v16, v180, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v180, v16, v180, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v183, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v16, v183, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v183, v16, v183, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v4, s74 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v16, v183, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v183, v16, v183, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v186, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v16, v186, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v186, v16, v186, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v8, s74 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v16, v186, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v186, v16, v186, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc49 // copy acc to vreg[76] v_accvgpr_read_b32 v[vgprValuC+18], acc53 // copy acc to vreg[77] v_accvgpr_read_b32 v[vgprValuC+19], acc57 // copy acc to vreg[78] @@ -20213,266 +20195,266 @@ s_waitcnt 0 // wait for Beta, Bias LDS, S v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v55, v[vgprValuC+17] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v17, v8 v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v60, v[vgprValuC+18] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v18, v8 v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v65, v[vgprValuC+19] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v19, v8 v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v70, v[vgprValuC+20] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v20, v8 v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v75, v[vgprValuC+21] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v80, v[vgprValuC+22] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v85, v[vgprValuC+23] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v90, v[vgprValuC+24] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v95, v[vgprValuC+25] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v98, v[vgprValuC+26] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v101, v[vgprValuC+27] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v104, v[vgprValuC+28] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v107, v[vgprValuC+29] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v110, v[vgprValuC+30] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v113, v[vgprValuC+31] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v116, v[vgprValuC+32] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v119, v[vgprValuC+33] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v122, v[vgprValuC+34] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v125, v[vgprValuC+35] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v128, v[vgprValuC+36] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v131, v[vgprValuC+37] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v137, v[vgprValuC+38] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v140, v[vgprValuC+39] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v143, v[vgprValuC+40] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v146, v[vgprValuC+41] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v149, v[vgprValuC+42] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v152, v[vgprValuC+43] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v155, v[vgprValuC+44] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+45], s[sgprBeta], v158, v[vgprValuC+45] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+46], s[sgprBeta], v161, v[vgprValuC+46] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+47], s[sgprBeta], v164, v[vgprValuC+47] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+48], s[sgprBeta], v167, v[vgprValuC+48] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+49], s[sgprBeta], v170, v[vgprValuC+49] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+50], s[sgprBeta], v173, v[vgprValuC+50] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+51], s[sgprBeta], v176, v[vgprValuC+51] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+52], s[sgprBeta], v179, v[vgprValuC+52] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+53], s[sgprBeta], v182, v[vgprValuC+53] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+54], s[sgprBeta], v185, v[vgprValuC+54] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -20488,530 +20470,530 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,14,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v58, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v8, s74 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v63, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v8, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v68, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v8, s74 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v73, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v8, s74 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v78, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v8, s74 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,14,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v83, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v88, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v16, v88, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v88, v16, v88, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v4, s74 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v16, v88, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v88, v16, v88, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v93, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v8, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v16, v96, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v96, v16, v96, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v8, s74 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v16, v96, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v96, v16, v96, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v99, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v102, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v16, v102, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v102, v16, v102, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v8, s74 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v16, v102, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v102, v16, v102, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v105, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v108, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v16, v108, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v108, v16, v108, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v8, s74 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v16, v108, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v108, v16, v108, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,15,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v111, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v114, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v16, v114, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v114, v16, v114, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v4, s74 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v16, v114, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v114, v16, v114, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v117, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v120, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v16, v120, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v120, v16, v120, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v8, s74 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v16, v120, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v120, v16, v120, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v123, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v126, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v16, v126, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v126, v16, v126, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v8, s74 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v16, v126, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v126, v16, v126, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v129, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v135, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v16, v135, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v135, v16, v135, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v8, s74 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v16, v135, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v135, v16, v135, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,16,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v138, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v141, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v16, v141, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v141, v16, v141, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v4, s74 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v16, v141, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v141, v16, v141, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v144, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v147, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v16, v147, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v147, v16, v147, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v8, s74 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v16, v147, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v147, v16, v147, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v150, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v153, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v16, v153, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v153, v16, v153, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v8, s74 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v16, v153, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v153, v16, v153, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v156, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v159, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v16, v159, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v159, v16, v159, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v8, s74 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v16, v159, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v159, v16, v159, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,17,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v162, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v165, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v16, v165, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v165, v16, v165, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v4, s74 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v16, v165, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v165, v16, v165, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v168, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v8, s74 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v171, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v16, v171, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v171, v16, v171, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v8, s74 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v16, v171, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v171, v16, v171, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v174, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v16, v174, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v174, v16, v174, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v8, s74 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v16, v174, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v174, v16, v174, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v177, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v16, v177, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v177, v16, v177, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v8, s74 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v16, v177, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v177, v16, v177, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v180, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v16, v180, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v180, v16, v180, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v8, s74 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v16, v180, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v180, v16, v180, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v183, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v16, v183, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v183, v16, v183, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v8, s74 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v16, v183, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v183, v16, v183, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,18,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v186, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v16, v186, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v186, v16, v186, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v8, s74 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v16, v186, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v186, v16, v186, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc201 // copy acc to vreg[114] v_accvgpr_read_b32 v[vgprValuC+18], acc205 // copy acc to vreg[115] v_accvgpr_read_b32 v[vgprValuC+19], acc209 // copy acc to vreg[116] @@ -21078,266 +21060,266 @@ s_waitcnt 0 // wait for Beta, Bias LDS, S v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v55, v[vgprValuC+17] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v17, v8 v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v60, v[vgprValuC+18] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v18, v8 v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v65, v[vgprValuC+19] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v19, v8 v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v70, v[vgprValuC+20] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v20, v8 v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v75, v[vgprValuC+21] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v80, v[vgprValuC+22] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v85, v[vgprValuC+23] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v90, v[vgprValuC+24] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v95, v[vgprValuC+25] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v98, v[vgprValuC+26] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v101, v[vgprValuC+27] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v104, v[vgprValuC+28] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v107, v[vgprValuC+29] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v110, v[vgprValuC+30] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v113, v[vgprValuC+31] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v116, v[vgprValuC+32] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v119, v[vgprValuC+33] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v122, v[vgprValuC+34] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v125, v[vgprValuC+35] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v128, v[vgprValuC+36] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v131, v[vgprValuC+37] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v137, v[vgprValuC+38] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v140, v[vgprValuC+39] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v143, v[vgprValuC+40] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v146, v[vgprValuC+41] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v149, v[vgprValuC+42] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v152, v[vgprValuC+43] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v155, v[vgprValuC+44] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+45], s[sgprBeta], v158, v[vgprValuC+45] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+46], s[sgprBeta], v161, v[vgprValuC+46] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+47], s[sgprBeta], v164, v[vgprValuC+47] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+48], s[sgprBeta], v167, v[vgprValuC+48] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+49], s[sgprBeta], v170, v[vgprValuC+49] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+50], s[sgprBeta], v173, v[vgprValuC+50] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+51], s[sgprBeta], v176, v[vgprValuC+51] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+52], s[sgprBeta], v179, v[vgprValuC+52] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+53], s[sgprBeta], v182, v[vgprValuC+53] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+54], s[sgprBeta], v185, v[vgprValuC+54] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -21357,530 +21339,530 @@ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v58, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v4, s74 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v63, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v8, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v68, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v8, s74 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v73, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v8, s74 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v78, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v8, s74 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v83, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v88, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v16, v88, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v88, v16, v88, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v8, s74 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v16, v88, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v88, v16, v88, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,19,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v93, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v8, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v96, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v16, v96, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v96, v16, v96, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v4, s74 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v16, v96, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v96, v16, v96, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v99, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v102, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v16, v102, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v102, v16, v102, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v8, s74 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v16, v102, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v102, v16, v102, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v105, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v108, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v16, v108, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v108, v16, v108, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v8, s74 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v16, v108, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v108, v16, v108, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v111, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v114, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v16, v114, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v114, v16, v114, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v8, s74 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v16, v114, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v114, v16, v114, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,20,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v117, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v120, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v16, v120, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v120, v16, v120, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v4, s74 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v16, v120, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v120, v16, v120, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v123, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v126, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v16, v126, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v126, v16, v126, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v8, s74 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v16, v126, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v126, v16, v126, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v129, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v135, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v16, v135, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v135, v16, v135, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v8, s74 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v16, v135, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v135, v16, v135, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v138, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v141, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v16, v141, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v141, v16, v141, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v8, s74 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v16, v141, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v141, v16, v141, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,21,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v144, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v147, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v16, v147, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v147, v16, v147, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v4, s74 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v16, v147, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v147, v16, v147, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v150, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v153, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v16, v153, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v153, v16, v153, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v8, s74 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v16, v153, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v153, v16, v153, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v156, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v159, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v16, v159, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v159, v16, v159, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v8, s74 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v16, v159, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v159, v16, v159, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v162, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v165, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v16, v165, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v165, v16, v165, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v8, s74 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v16, v165, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v165, v16, v165, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,22,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v168, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v8, s74 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v171, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v16, v171, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v171, v16, v171, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v4, s74 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v16, v171, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v171, v16, v171, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v174, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v16, v174, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v174, v16, v174, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v8, s74 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v16, v174, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v174, v16, v174, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v177, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v16, v177, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v177, v16, v177, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v8, s74 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v16, v177, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v177, v16, v177, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v180, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v16, v180, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v180, v16, v180, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v8, s74 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v16, v180, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v180, v16, v180, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v183, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v16, v183, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v183, v16, v183, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v8, s74 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v16, v183, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v183, v16, v183, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v186, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v16, v186, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v186, v16, v186, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v8, s74 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v16, v186, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v186, v16, v186, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc98 // copy acc to vreg[152] v_accvgpr_read_b32 v[vgprValuC+18], acc102 // copy acc to vreg[153] v_accvgpr_read_b32 v[vgprValuC+19], acc106 // copy acc to vreg[154] @@ -21947,266 +21929,266 @@ s_waitcnt 0 // wait for Beta, Bias LDS, S v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v55, v[vgprValuC+17] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v17, v8 v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v60, v[vgprValuC+18] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v18, v8 v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v65, v[vgprValuC+19] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v19, v8 v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v70, v[vgprValuC+20] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v20, v8 v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v75, v[vgprValuC+21] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v80, v[vgprValuC+22] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v85, v[vgprValuC+23] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v90, v[vgprValuC+24] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v95, v[vgprValuC+25] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v98, v[vgprValuC+26] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v101, v[vgprValuC+27] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v104, v[vgprValuC+28] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v107, v[vgprValuC+29] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v110, v[vgprValuC+30] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v113, v[vgprValuC+31] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v116, v[vgprValuC+32] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v119, v[vgprValuC+33] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v122, v[vgprValuC+34] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v125, v[vgprValuC+35] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v128, v[vgprValuC+36] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v131, v[vgprValuC+37] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v137, v[vgprValuC+38] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v140, v[vgprValuC+39] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v143, v[vgprValuC+40] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v146, v[vgprValuC+41] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v149, v[vgprValuC+42] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v152, v[vgprValuC+43] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v155, v[vgprValuC+44] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+45], s[sgprBeta], v158, v[vgprValuC+45] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+46], s[sgprBeta], v161, v[vgprValuC+46] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+47], s[sgprBeta], v164, v[vgprValuC+47] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+48], s[sgprBeta], v167, v[vgprValuC+48] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+49], s[sgprBeta], v170, v[vgprValuC+49] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+50], s[sgprBeta], v173, v[vgprValuC+50] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+51], s[sgprBeta], v176, v[vgprValuC+51] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+52], s[sgprBeta], v179, v[vgprValuC+52] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+53], s[sgprBeta], v182, v[vgprValuC+53] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+54], s[sgprBeta], v185, v[vgprValuC+54] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -22222,534 +22204,534 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,23,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v58, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v8, s74 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,23,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v63, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v8, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v68, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s74 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v73, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v8, s74 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v78, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v8, s74 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v83, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v88, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v16, v88, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v88, v16, v88, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v85, v88, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v89, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v89, v8, s74 v_lshlrev_b32 v89, 0x2, v89 // Bias address scaled by BPE ds_read_b32 v86, v89 offset:0 // load Bias ds_read_b32 v87, v89 offset:1024 // load scaleAlpha v_add_lshl_u32 v88, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v88, v16, v88, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v88, v16, v88, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v93, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v90, v93, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v94, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v94, v8, s74 v_lshlrev_b32 v94, 0x2, v94 // Bias address scaled by BPE ds_read_b32 v91, v94 offset:0 // load Bias ds_read_b32 v92, v94 offset:1024 // load scaleAlpha v_add_lshl_u32 v93, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v93, v16, v93, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v93, v16, v93, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v96, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v16, v96, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v96, v16, v96, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v95, v96, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v97, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v97, v8, s74 v_lshlrev_b32 v97, 0x2, v97 // Bias address scaled by BPE v_add_lshl_u32 v96, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v96, v16, v96, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v96, v16, v96, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,24,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v99, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v98, v99, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v100, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v100, v8, s74 v_lshlrev_b32 v100, 0x2, v100 // Bias address scaled by BPE v_add_lshl_u32 v99, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v99, v16, v99, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v99, v16, v99, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v102, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v16, v102, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v102, v16, v102, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v101, v102, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v103, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v103, v4, s74 v_lshlrev_b32 v103, 0x2, v103 // Bias address scaled by BPE v_add_lshl_u32 v102, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v102, v16, v102, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v102, v16, v102, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v105, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v104, v105, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v106, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v106, v8, s74 v_lshlrev_b32 v106, 0x2, v106 // Bias address scaled by BPE v_add_lshl_u32 v105, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v105, v16, v105, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v105, v16, v105, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v108, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v16, v108, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v108, v16, v108, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v107, v108, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v109, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v109, v8, s74 v_lshlrev_b32 v109, 0x2, v109 // Bias address scaled by BPE v_add_lshl_u32 v108, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v108, v16, v108, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v108, v16, v108, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v111, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v110, v111, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v112, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v112, v8, s74 v_lshlrev_b32 v112, 0x2, v112 // Bias address scaled by BPE v_add_lshl_u32 v111, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v111, v16, v111, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v111, v16, v111, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v114, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v16, v114, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v114, v16, v114, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v113, v114, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v115, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v115, v8, s74 v_lshlrev_b32 v115, 0x2, v115 // Bias address scaled by BPE v_add_lshl_u32 v114, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v114, v16, v114, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v114, v16, v114, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v117, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v116, v117, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v118, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v118, v8, s74 v_lshlrev_b32 v118, 0x2, v118 // Bias address scaled by BPE v_add_lshl_u32 v117, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v117, v16, v117, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v117, v16, v117, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v120, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v16, v120, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v120, v16, v120, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v119, v120, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v121, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v121, v8, s74 v_lshlrev_b32 v121, 0x2, v121 // Bias address scaled by BPE v_add_lshl_u32 v120, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v120, v16, v120, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v120, v16, v120, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,25,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v123, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v122, v123, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v124, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v124, v8, s74 v_lshlrev_b32 v124, 0x2, v124 // Bias address scaled by BPE v_add_lshl_u32 v123, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v123, v16, v123, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v123, v16, v123, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v126, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v16, v126, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v126, v16, v126, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v125, v126, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v127, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v127, v4, s74 v_lshlrev_b32 v127, 0x2, v127 // Bias address scaled by BPE v_add_lshl_u32 v126, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v126, v16, v126, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v126, v16, v126, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v129, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v128, v129, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v130, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v130, v8, s74 v_lshlrev_b32 v130, 0x2, v130 // Bias address scaled by BPE v_add_lshl_u32 v129, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v129, v16, v129, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v129, v16, v129, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v135, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v16, v135, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v135, v16, v135, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v131, v135, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v136, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v136, v8, s74 v_lshlrev_b32 v136, 0x2, v136 // Bias address scaled by BPE v_add_lshl_u32 v135, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, v16, v135, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v135, v16, v135, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v138, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v137, v138, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v139, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v139, v8, s74 v_lshlrev_b32 v139, 0x2, v139 // Bias address scaled by BPE v_add_lshl_u32 v138, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v138, v16, v138, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v138, v16, v138, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v141, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v16, v141, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v141, v16, v141, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v140, v141, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v142, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v142, v8, s74 v_lshlrev_b32 v142, 0x2, v142 // Bias address scaled by BPE v_add_lshl_u32 v141, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v141, v16, v141, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v141, v16, v141, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v144, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v143, v144, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v145, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v145, v8, s74 v_lshlrev_b32 v145, 0x2, v145 // Bias address scaled by BPE v_add_lshl_u32 v144, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v144, v16, v144, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v144, v16, v144, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v147, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v16, v147, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v147, v16, v147, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v146, v147, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v148, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v148, v8, s74 v_lshlrev_b32 v148, 0x2, v148 // Bias address scaled by BPE v_add_lshl_u32 v147, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v147, v16, v147, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v147, v16, v147, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,26,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v150, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v149, v150, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v151, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v151, v8, s74 v_lshlrev_b32 v151, 0x2, v151 // Bias address scaled by BPE v_add_lshl_u32 v150, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v150, v16, v150, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v150, v16, v150, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v153, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v16, v153, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v153, v16, v153, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v152, v153, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v154, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v154, v4, s74 v_lshlrev_b32 v154, 0x2, v154 // Bias address scaled by BPE v_add_lshl_u32 v153, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v153, v16, v153, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v153, v16, v153, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v156, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v155, v156, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v157, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v157, v8, s74 v_lshlrev_b32 v157, 0x2, v157 // Bias address scaled by BPE v_add_lshl_u32 v156, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v156, v16, v156, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v156, v16, v156, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v159, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v16, v159, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v159, v16, v159, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v158, v159, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v160, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v160, v8, s74 v_lshlrev_b32 v160, 0x2, v160 // Bias address scaled by BPE v_add_lshl_u32 v159, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v159, v16, v159, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v159, v16, v159, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v162, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v161, v162, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v163, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v163, v8, s74 v_lshlrev_b32 v163, 0x2, v163 // Bias address scaled by BPE v_add_lshl_u32 v162, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v162, v16, v162, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v162, v16, v162, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v165, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v16, v165, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v165, v16, v165, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v164, v165, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v166, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v166, v8, s74 v_lshlrev_b32 v166, 0x2, v166 // Bias address scaled by BPE v_add_lshl_u32 v165, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v165, v16, v165, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v165, v16, v165, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v168, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v167, v168, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v169, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v169, v8, s74 v_lshlrev_b32 v169, 0x2, v169 // Bias address scaled by BPE v_add_lshl_u32 v168, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v168, v16, v168, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v168, v16, v168, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v171, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v16, v171, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v171, v16, v171, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v170, v171, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v172, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v172, v8, s74 v_lshlrev_b32 v172, 0x2, v172 // Bias address scaled by BPE v_add_lshl_u32 v171, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v171, v16, v171, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v171, v16, v171, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,27,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v174, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v16, v174, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v174, v16, v174, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v173, v174, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v175, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v175, v8, s74 v_lshlrev_b32 v175, 0x2, v175 // Bias address scaled by BPE v_add_lshl_u32 v174, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v174, v16, v174, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v174, v16, v174, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v177, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v16, v177, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v177, v16, v177, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v176, v177, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v178, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v178, v4, s74 v_lshlrev_b32 v178, 0x2, v178 // Bias address scaled by BPE v_add_lshl_u32 v177, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v177, v16, v177, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v177, v16, v177, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v180, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v16, v180, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v180, v16, v180, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v179, v180, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v181, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v181, v8, s74 v_lshlrev_b32 v181, 0x2, v181 // Bias address scaled by BPE v_add_lshl_u32 v180, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v180, v16, v180, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v180, v16, v180, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v183, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v16, v183, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v183, v16, v183, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v182, v183, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v184, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v184, v8, s74 v_lshlrev_b32 v184, 0x2, v184 // Bias address scaled by BPE v_add_lshl_u32 v183, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v183, v16, v183, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v183, v16, v183, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v186, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v16, v186, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v186, v16, v186, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v185, v186, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v187, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v187, v8, s74 v_lshlrev_b32 v187, 0x2, v187 // Bias address scaled by BPE v_add_lshl_u32 v186, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v186, v16, v186, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v186, v16, v186, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc250 // copy acc to vreg[190] v_accvgpr_read_b32 v[vgprValuC+18], acc254 // copy acc to vreg[191] v_accvgpr_read_b32 v[vgprValuC+19], acc3 // copy acc to vreg[192] @@ -22816,266 +22798,266 @@ s_waitcnt 0 // wait for Beta, Bias LDS, S v_mul_f32 v[vgprValuC+17], v57, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v55, v[vgprValuC+17] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v17, v8 v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 buffer_store_short v17, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v62, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v60, v[vgprValuC+18] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v18, v8 v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 buffer_store_short v18, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v67, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v65, v[vgprValuC+19] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v19, v8 v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 buffer_store_short v19, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v72, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v70, v[vgprValuC+20] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v20, v8 v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 buffer_store_short v20, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v77, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v75, v[vgprValuC+21] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 buffer_store_short v21, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v82, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v80, v[vgprValuC+22] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 buffer_store_short v22, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v85, v[vgprValuC+23] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v92, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v90, v[vgprValuC+24] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 buffer_store_short v24, v93, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v57, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v95, v[vgprValuC+25] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 buffer_store_short v25, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v62, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v98, v[vgprValuC+26] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 buffer_store_short v26, v99, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v67, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v101, v[vgprValuC+27] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 buffer_store_short v27, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v72, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v104, v[vgprValuC+28] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 buffer_store_short v28, v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v77, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v107, v[vgprValuC+29] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 buffer_store_short v29, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v82, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v110, v[vgprValuC+30] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 buffer_store_short v30, v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v113, v[vgprValuC+31] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 buffer_store_short v31, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v92, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v116, v[vgprValuC+32] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 buffer_store_short v32, v117, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v57, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v119, v[vgprValuC+33] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 buffer_store_short v33, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v62, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v122, v[vgprValuC+34] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 buffer_store_short v34, v123, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v67, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v125, v[vgprValuC+35] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 buffer_store_short v35, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v72, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v128, v[vgprValuC+36] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 buffer_store_short v36, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v77, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v131, v[vgprValuC+37] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 buffer_store_short v37, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v82, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v137, v[vgprValuC+38] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 buffer_store_short v38, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v140, v[vgprValuC+39] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 buffer_store_short v39, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v92, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v143, v[vgprValuC+40] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 buffer_store_short v40, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v57, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v146, v[vgprValuC+41] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 buffer_store_short v41, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v62, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v149, v[vgprValuC+42] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 buffer_store_short v42, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v67, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v152, v[vgprValuC+43] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 buffer_store_short v43, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v72, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v155, v[vgprValuC+44] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 buffer_store_short v44, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+45], v77, v[vgprValuC+45] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+45], s[sgprBeta], v158, v[vgprValuC+45] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+45] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v45, v8 v_cvt_f16_f32 v45, v[vgprValuC+45] // convert C to fp16 buffer_store_short v45, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+46], v82, v[vgprValuC+46] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+46], s[sgprBeta], v161, v[vgprValuC+46] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+46] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v46, v8 v_cvt_f16_f32 v46, v[vgprValuC+46] // convert C to fp16 buffer_store_short v46, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+47], s[sgprBeta], v164, v[vgprValuC+47] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v86, v[vgprValuC+47] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v47, v8 v_cvt_f16_f32 v47, v[vgprValuC+47] // convert C to fp16 buffer_store_short v47, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+48], v92, v[vgprValuC+48] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+48], s[sgprBeta], v167, v[vgprValuC+48] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v91, v[vgprValuC+48] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v48, v8 v_cvt_f16_f32 v48, v[vgprValuC+48] // convert C to fp16 buffer_store_short v48, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+49], v57, v[vgprValuC+49] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+49], s[sgprBeta], v170, v[vgprValuC+49] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+49] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v49, v8 v_cvt_f16_f32 v49, v[vgprValuC+49] // convert C to fp16 buffer_store_short v49, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+50], v62, v[vgprValuC+50] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+50], s[sgprBeta], v173, v[vgprValuC+50] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+50] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v50, v8 v_cvt_f16_f32 v50, v[vgprValuC+50] // convert C to fp16 buffer_store_short v50, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+51], v67, v[vgprValuC+51] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+51], s[sgprBeta], v176, v[vgprValuC+51] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+51] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v51, v8 v_cvt_f16_f32 v51, v[vgprValuC+51] // convert C to fp16 buffer_store_short v51, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+52], v72, v[vgprValuC+52] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+52], s[sgprBeta], v179, v[vgprValuC+52] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+52] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v52, v8 v_cvt_f16_f32 v52, v[vgprValuC+52] // convert C to fp16 buffer_store_short v52, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+53], v77, v[vgprValuC+53] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+53], s[sgprBeta], v182, v[vgprValuC+53] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+53] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v53, v8 v_cvt_f16_f32 v53, v[vgprValuC+53] // convert C to fp16 buffer_store_short v53, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+54], v82, v[vgprValuC+54] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+54], s[sgprBeta], v185, v[vgprValuC+54] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+54] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v54, v8 v_cvt_f16_f32 v54, v[vgprValuC+54] // convert C to fp16 buffer_store_short v54, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D @@ -23091,396 +23073,396 @@ s_nop 0 // 1 wait state required when v_mov_b32 v16, BufferOOB /* (d1,vc1,d0,vc0)=(0,28,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v48, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v48, v16, v48, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v48, v16, v48, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v45, v48, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v49, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v49, v8, s74 v_lshlrev_b32 v49, 0x2, v49 // Bias address scaled by BPE ds_read_b32 v46, v49 offset:0 // load Bias ds_read_b32 v47, v49 offset:1024 // load scaleAlpha v_add_lshl_u32 v48, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v48, v16, v48, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v48, v16, v48, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v53, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v53, v16, v53, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v53, v16, v53, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v50, v53, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v54, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v54, v8, s74 v_lshlrev_b32 v54, 0x2, v54 // Bias address scaled by BPE ds_read_b32 v51, v54 offset:0 // load Bias ds_read_b32 v52, v54 offset:1024 // load scaleAlpha v_add_lshl_u32 v53, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v53, v16, v53, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v53, v16, v53, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v58, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v55, v58, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v59, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v59, v8, s74 v_lshlrev_b32 v59, 0x2, v59 // Bias address scaled by BPE ds_read_b32 v56, v59 offset:0 // load Bias ds_read_b32 v57, v59 offset:1024 // load scaleAlpha v_add_lshl_u32 v58, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v58, v16, v58, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v58, v16, v58, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,28,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v63, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v60, v63, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v64, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v64, v8, s74 v_lshlrev_b32 v64, 0x2, v64 // Bias address scaled by BPE ds_read_b32 v61, v64 offset:0 // load Bias ds_read_b32 v62, v64 offset:1024 // load scaleAlpha v_add_lshl_u32 v63, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v63, v16, v63, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v63, v16, v63, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v68, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v65, v68, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v69, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v69, v4, s74 v_lshlrev_b32 v69, 0x2, v69 // Bias address scaled by BPE ds_read_b32 v66, v69 offset:0 // load Bias ds_read_b32 v67, v69 offset:1024 // load scaleAlpha v_add_lshl_u32 v68, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v68, v16, v68, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v68, v16, v68, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v73, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v70, v73, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v74, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v74, v8, s74 v_lshlrev_b32 v74, 0x2, v74 // Bias address scaled by BPE ds_read_b32 v71, v74 offset:0 // load Bias ds_read_b32 v72, v74 offset:1024 // load scaleAlpha v_add_lshl_u32 v73, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v73, v16, v73, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v73, v16, v73, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v78, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v75, v78, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v79, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v79, v8, s74 v_lshlrev_b32 v79, 0x2, v79 // Bias address scaled by BPE ds_read_b32 v76, v79 offset:0 // load Bias ds_read_b32 v77, v79 offset:1024 // load scaleAlpha v_add_lshl_u32 v78, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v78, v16, v78, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v78, v16, v78, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v83, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v80, v83, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v84, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v84, v8, s74 v_lshlrev_b32 v84, 0x2, v84 // Bias address scaled by BPE ds_read_b32 v81, v84 offset:0 // load Bias ds_read_b32 v82, v84 offset:1024 // load scaleAlpha v_add_lshl_u32 v83, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v83, v16, v83, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v83, v16, v83, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v86, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v86, v16, v86, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v86, v16, v86, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v85, v86, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v87, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v87, v8, s74 v_lshlrev_b32 v87, 0x2, v87 // Bias address scaled by BPE v_add_lshl_u32 v86, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v86, v16, v86, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v86, v16, v86, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v89, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v89, v16, v89, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v89, v16, v89, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v88, v89, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v90, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v90, v8, s74 v_lshlrev_b32 v90, 0x2, v90 // Bias address scaled by BPE v_add_lshl_u32 v89, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v89, v16, v89, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v89, v16, v89, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v92, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v16, v92, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v92, v16, v92, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v91, v92, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v93, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v93, v8, s74 v_lshlrev_b32 v93, 0x2, v93 // Bias address scaled by BPE v_add_lshl_u32 v92, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v92, v16, v92, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v92, v16, v92, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,29,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v95, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v94, v95, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v96, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v96, v8, s74 v_lshlrev_b32 v96, 0x2, v96 // Bias address scaled by BPE v_add_lshl_u32 v95, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v95, v16, v95, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v95, v16, v95, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v98, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v97, v98, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v99, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v99, v4, s74 v_lshlrev_b32 v99, 0x2, v99 // Bias address scaled by BPE v_add_lshl_u32 v98, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v98, v16, v98, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v98, v16, v98, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v101, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v100, v101, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v102, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v102, v8, s74 v_lshlrev_b32 v102, 0x2, v102 // Bias address scaled by BPE v_add_lshl_u32 v101, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v101, v16, v101, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v101, v16, v101, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v104, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v103, v104, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v105, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v105, v8, s74 v_lshlrev_b32 v105, 0x2, v105 // Bias address scaled by BPE v_add_lshl_u32 v104, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v104, v16, v104, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v104, v16, v104, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v107, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v106, v107, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v108, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v108, v8, s74 v_lshlrev_b32 v108, 0x2, v108 // Bias address scaled by BPE v_add_lshl_u32 v107, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v107, v16, v107, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v107, v16, v107, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v110, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v109, v110, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v111, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v111, v8, s74 v_lshlrev_b32 v111, 0x2, v111 // Bias address scaled by BPE v_add_lshl_u32 v110, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v110, v16, v110, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v110, v16, v110, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v113, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v112, v113, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v114, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v114, v8, s74 v_lshlrev_b32 v114, 0x2, v114 // Bias address scaled by BPE v_add_lshl_u32 v113, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v113, v16, v113, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v113, v16, v113, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v116, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v115, v116, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v117, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v117, v8, s74 v_lshlrev_b32 v117, 0x2, v117 // Bias address scaled by BPE v_add_lshl_u32 v116, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v116, v16, v116, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v116, v16, v116, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,30,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v119, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v118, v119, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v120, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v120, v8, s74 v_lshlrev_b32 v120, 0x2, v120 // Bias address scaled by BPE v_add_lshl_u32 v119, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v119, v16, v119, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v119, v16, v119, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,0) */ v_add_co_u32 v5, vcc, v5, 1 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ v_add_u32 v6, v6, s[sgprStrideC1J] // ROWINC- Move cinRowPtr to next row v_add_u32 v7, v7, s[sgprStrideD1J] // Move coutRowPtrD to next row -v_cmp_lt_u32 s[82:83], v4, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v4, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v122, v6, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v121, v122, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v123, v4, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v123, v4, s74 v_lshlrev_b32 v123, 0x2, v123 // Bias address scaled by BPE v_add_lshl_u32 v122, v7, v4, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v122, v16, v122, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v122, v16, v122, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,1) */ v_add_co_u32 v8, vcc, v4, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v125, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v124, v125, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v126, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v126, v8, s74 v_lshlrev_b32 v126, 0x2, v126 // Bias address scaled by BPE v_add_lshl_u32 v125, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v125, v16, v125, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v125, v16, v125, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,2) */ v_add_co_u32 v8, vcc, v4, 2 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v128, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v127, v128, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v129, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v129, v8, s74 v_lshlrev_b32 v129, 0x2, v129 // Bias address scaled by BPE v_add_lshl_u32 v128, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v128, v16, v128, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v128, v16, v128, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,3) */ v_add_co_u32 v8, vcc, v4, 3 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v131, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v130, v131, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v135, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v135, v8, s74 v_lshlrev_b32 v135, 0x2, v135 // Bias address scaled by BPE v_add_lshl_u32 v131, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v131, v16, v131, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v131, v16, v131, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,4) */ v_add_co_u32 v8, vcc, v4, 4 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v137, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v136, v137, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v138, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v138, v8, s74 v_lshlrev_b32 v138, 0x2, v138 // Bias address scaled by BPE v_add_lshl_u32 v137, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v137, v16, v137, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v137, v16, v137, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,5) */ v_add_co_u32 v8, vcc, v4, 5 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v140, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v139, v140, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v141, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v141, v8, s74 v_lshlrev_b32 v141, 0x2, v141 // Bias address scaled by BPE v_add_lshl_u32 v140, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v140, v16, v140, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v140, v16, v140, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,6) */ v_add_co_u32 v8, vcc, v4, 6 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v143, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16 v142, v143, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v144, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v144, v8, s74 v_lshlrev_b32 v144, 0x2, v144 // Bias address scaled by BPE v_add_lshl_u32 v143, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v143, v16, v143, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v143, v16, v143, s[78:79] // LDD clip if OOB. offset /* (d1,vc1,d0,vc0)=(0,31,0,7) */ v_add_co_u32 v8, vcc, v4, 7 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[82:83], v8, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[86:87], v5, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[86:87], s[82:83], s[86:87] // in0 && in1 +v_cmp_lt_u32 s[74:75], v8, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[78:79], v5, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[78:79], s[74:75], s[78:79] // in0 && in1 v_add_lshl_u32 v146, v6, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDC clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDC clip if OOB. offset buffer_load_short_d16_hi v145, v146, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C -s_mul_i32 s82, 256, s[sgprWorkGroup0] // wgp0 * MT0 -v_sub_u32 v147, v8, s82 +s_mul_i32 s74, 256, s[sgprWorkGroup0] // wgp0 * MT0 +v_sub_u32 v147, v8, s74 v_lshlrev_b32 v147, 0x2, v147 // Bias address scaled by BPE v_add_lshl_u32 v146, v7, v8, 0x1 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v146, v16, v146, s[86:87] // LDD clip if OOB. offset +v_cndmask_b32 v146, v16, v146, s[78:79] // LDD clip if OOB. offset v_accvgpr_read_b32 v[vgprValuC+17], acc147 // copy acc to vreg[228] v_accvgpr_read_b32 v[vgprValuC+18], acc151 // copy acc to vreg[229] v_accvgpr_read_b32 v[vgprValuC+19], acc155 // copy acc to vreg[230] @@ -23532,203 +23514,203 @@ s_waitcnt 0 // wait for Beta, Bias LDS, S v_mul_f32 v[vgprValuC+17], v47, v[vgprValuC+17] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+17], s[sgprBeta], v45, v[vgprValuC+17] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v46, v[vgprValuC+17] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v17, v8 v_cvt_f16_f32 v17, v[vgprValuC+17] // convert C to fp16 buffer_store_short v17, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+18], v52, v[vgprValuC+18] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+18], s[sgprBeta], v50, v[vgprValuC+18] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v51, v[vgprValuC+18] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v18, v8 v_cvt_f16_f32 v18, v[vgprValuC+18] // convert C to fp16 buffer_store_short v18, v53, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+19], v57, v[vgprValuC+19] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+19], s[sgprBeta], v55, v[vgprValuC+19] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+19] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v19, v8 v_cvt_f16_f32 v19, v[vgprValuC+19] // convert C to fp16 buffer_store_short v19, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+20], v62, v[vgprValuC+20] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+20], s[sgprBeta], v60, v[vgprValuC+20] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+20] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v20, v8 v_cvt_f16_f32 v20, v[vgprValuC+20] // convert C to fp16 buffer_store_short v20, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+21], v67, v[vgprValuC+21] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+21], s[sgprBeta], v65, v[vgprValuC+21] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+21] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v21, v8 v_cvt_f16_f32 v21, v[vgprValuC+21] // convert C to fp16 buffer_store_short v21, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+22], v72, v[vgprValuC+22] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+22], s[sgprBeta], v70, v[vgprValuC+22] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+22] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v22, v8 v_cvt_f16_f32 v22, v[vgprValuC+22] // convert C to fp16 buffer_store_short v22, v73, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+23], v77, v[vgprValuC+23] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+23], s[sgprBeta], v75, v[vgprValuC+23] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+23] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v23, v8 v_cvt_f16_f32 v23, v[vgprValuC+23] // convert C to fp16 buffer_store_short v23, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+24], v82, v[vgprValuC+24] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+24], s[sgprBeta], v80, v[vgprValuC+24] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+24] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v24, v8 v_cvt_f16_f32 v24, v[vgprValuC+24] // convert C to fp16 buffer_store_short v24, v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+25], v47, v[vgprValuC+25] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+25], s[sgprBeta], v85, v[vgprValuC+25] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v46, v[vgprValuC+25] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v25, v8 v_cvt_f16_f32 v25, v[vgprValuC+25] // convert C to fp16 buffer_store_short v25, v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+26], v52, v[vgprValuC+26] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+26], s[sgprBeta], v88, v[vgprValuC+26] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v51, v[vgprValuC+26] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v26, v8 v_cvt_f16_f32 v26, v[vgprValuC+26] // convert C to fp16 buffer_store_short v26, v89, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+27], v57, v[vgprValuC+27] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+27], s[sgprBeta], v91, v[vgprValuC+27] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+27] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v27, v8 v_cvt_f16_f32 v27, v[vgprValuC+27] // convert C to fp16 buffer_store_short v27, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+28], v62, v[vgprValuC+28] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+28], s[sgprBeta], v94, v[vgprValuC+28] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+28] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v28, v8 v_cvt_f16_f32 v28, v[vgprValuC+28] // convert C to fp16 buffer_store_short v28, v95, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+29], v67, v[vgprValuC+29] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+29], s[sgprBeta], v97, v[vgprValuC+29] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+29] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v29, v8 v_cvt_f16_f32 v29, v[vgprValuC+29] // convert C to fp16 buffer_store_short v29, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+30], v72, v[vgprValuC+30] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+30], s[sgprBeta], v100, v[vgprValuC+30] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+30] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v30, v8 v_cvt_f16_f32 v30, v[vgprValuC+30] // convert C to fp16 buffer_store_short v30, v101, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+31], v77, v[vgprValuC+31] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+31], s[sgprBeta], v103, v[vgprValuC+31] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+31] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v31, v8 v_cvt_f16_f32 v31, v[vgprValuC+31] // convert C to fp16 buffer_store_short v31, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+32], v82, v[vgprValuC+32] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+32], s[sgprBeta], v106, v[vgprValuC+32] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+32] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v32, v8 v_cvt_f16_f32 v32, v[vgprValuC+32] // convert C to fp16 buffer_store_short v32, v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+33], v47, v[vgprValuC+33] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+33], s[sgprBeta], v109, v[vgprValuC+33] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v46, v[vgprValuC+33] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v33, v8 v_cvt_f16_f32 v33, v[vgprValuC+33] // convert C to fp16 buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+34], v52, v[vgprValuC+34] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+34], s[sgprBeta], v112, v[vgprValuC+34] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v51, v[vgprValuC+34] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v34, v8 v_cvt_f16_f32 v34, v[vgprValuC+34] // convert C to fp16 buffer_store_short v34, v113, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+35], v57, v[vgprValuC+35] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+35], s[sgprBeta], v115, v[vgprValuC+35] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+35] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v35, v8 v_cvt_f16_f32 v35, v[vgprValuC+35] // convert C to fp16 buffer_store_short v35, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+36], v62, v[vgprValuC+36] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+36], s[sgprBeta], v118, v[vgprValuC+36] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+36] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v36, v8 v_cvt_f16_f32 v36, v[vgprValuC+36] // convert C to fp16 buffer_store_short v36, v119, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+37], v67, v[vgprValuC+37] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+37], s[sgprBeta], v121, v[vgprValuC+37] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v66, v[vgprValuC+37] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v37, v8 v_cvt_f16_f32 v37, v[vgprValuC+37] // convert C to fp16 buffer_store_short v37, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+38], v72, v[vgprValuC+38] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+38], s[sgprBeta], v124, v[vgprValuC+38] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v71, v[vgprValuC+38] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v38, v8 v_cvt_f16_f32 v38, v[vgprValuC+38] // convert C to fp16 buffer_store_short v38, v125, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+39], v77, v[vgprValuC+39] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+39], s[sgprBeta], v127, v[vgprValuC+39] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v76, v[vgprValuC+39] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v39, v8 v_cvt_f16_f32 v39, v[vgprValuC+39] // convert C to fp16 buffer_store_short v39, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+40], v82, v[vgprValuC+40] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+40], s[sgprBeta], v130, v[vgprValuC+40] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v81, v[vgprValuC+40] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v40, v8 v_cvt_f16_f32 v40, v[vgprValuC+40] // convert C to fp16 buffer_store_short v40, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+41], v47, v[vgprValuC+41] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+41], s[sgprBeta], v136, v[vgprValuC+41] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v46, v[vgprValuC+41] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v41, v8 v_cvt_f16_f32 v41, v[vgprValuC+41] // convert C to fp16 buffer_store_short v41, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+42], v52, v[vgprValuC+42] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+42], s[sgprBeta], v139, v[vgprValuC+42] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v51, v[vgprValuC+42] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v42, v8 v_cvt_f16_f32 v42, v[vgprValuC+42] // convert C to fp16 buffer_store_short v42, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+43], v57, v[vgprValuC+43] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+43], s[sgprBeta], v142, v[vgprValuC+43] op_sel:[0,0,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v56, v[vgprValuC+43] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v43, v8 v_cvt_f16_f32 v43, v[vgprValuC+43] // convert C to fp16 buffer_store_short v43, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D v_mul_f32 v[vgprValuC+44], v62, v[vgprValuC+44] // *= ScaleAlphaVecVMul v_fma_mix_f32 v[vgprValuC+44], s[sgprBeta], v145, v[vgprValuC+44] op_sel:[0,1,0] op_sel_hi:[0,1,0] // //C*=beta v_add_f32 v8, v61, v[vgprValuC+44] // C += bias -s_swappc_b64 s[72:73], s[8:9] +s_swappc_b64 s[64:65], s[8:9] v_mov_b32 v44, v8 v_cvt_f16_f32 v44, v[vgprValuC+44] // convert C to fp16 buffer_store_short v44, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst s_branch label_GW_End // jump to end label_Activation_None_VW8: -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] label_Activation_Gelu_VW8: v_mul_f32 v16, 0x3d372713, v8 // k1 * x v_fma_f32 v16, v8, v16, 1.0 // 1 + (k1 * x * x) @@ -23826,7 +23808,7 @@ s_nop 0 // 1 wait states v_fma_f32 v16, -2.0, v16, 2.0 // ( + 1 (fused)) v_mul_f32 v16, v15, v16 // x * (1 + tanh(...)) v_mul_f32 v15, 0.5, v16 // 0.5 * x * (1 + tanh(...)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] label_Activation_Relu_VW8: v_max_f32 v8, v8, 0 // x = max(0, x) v_max_f32 v9, v9, 0 // x = max(0, x) @@ -23836,7 +23818,7 @@ v_max_f32 v12, v12, 0 // x = max(0, x) v_max_f32 v13, v13, 0 // x = max(0, x) v_max_f32 v14, v14, 0 // x = max(0, x) v_max_f32 v15, v15, 0 // x = max(0, x) -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] label_Activation_Silu_VW8: v_mul_f32 v16, -1.4426950408889634, v8 // (fused -1.442695) v_exp_f32 v16, v16 // exp step 2 @@ -23894,9 +23876,27 @@ v_add_f32 v16, 1.0, v16 // 1 + exp(-x) v_rcp_f32 v16, v16 // 1 / (1 + exp(-x)) s_nop 0 // 1 wait states v_mul_f32 v15, v15, v16 // x / (1 + exp(-x)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] +label_Activation_Clamp_VW8: +v_min_f32 v8, s[sgpractivationBeta], v8 // min(x, beta) +v_max_f32 v8, s[sgpractivationAlpha], v8 // max(alpha, min(x, beta)) +v_min_f32 v9, s[sgpractivationBeta], v9 // min(x, beta) +v_max_f32 v9, s[sgpractivationAlpha], v9 // max(alpha, min(x, beta)) +v_min_f32 v10, s[sgpractivationBeta], v10 // min(x, beta) +v_max_f32 v10, s[sgpractivationAlpha], v10 // max(alpha, min(x, beta)) +v_min_f32 v11, s[sgpractivationBeta], v11 // min(x, beta) +v_max_f32 v11, s[sgpractivationAlpha], v11 // max(alpha, min(x, beta)) +v_min_f32 v12, s[sgpractivationBeta], v12 // min(x, beta) +v_max_f32 v12, s[sgpractivationAlpha], v12 // max(alpha, min(x, beta)) +v_min_f32 v13, s[sgpractivationBeta], v13 // min(x, beta) +v_max_f32 v13, s[sgpractivationAlpha], v13 // max(alpha, min(x, beta)) +v_min_f32 v14, s[sgpractivationBeta], v14 // min(x, beta) +v_max_f32 v14, s[sgpractivationAlpha], v14 // max(alpha, min(x, beta)) +v_min_f32 v15, s[sgpractivationBeta], v15 // min(x, beta) +v_max_f32 v15, s[sgpractivationAlpha], v15 // max(alpha, min(x, beta)) +s_setpc_b64 s[64:65] label_Activation_None_VW1: -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] label_Activation_Gelu_VW1: v_mul_f32 v16, 0x3d372713, v8 // k1 * x v_fma_f32 v16, v8, v16, 1.0 // 1 + (k1 * x * x) @@ -23910,10 +23910,10 @@ s_nop 0 // 1 wait states v_fma_f32 v16, -2.0, v16, 2.0 // ( + 1 (fused)) v_mul_f32 v16, v8, v16 // x * (1 + tanh(...)) v_mul_f32 v8, 0.5, v16 // 0.5 * x * (1 + tanh(...)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] label_Activation_Relu_VW1: v_max_f32 v8, v8, 0 // x = max(0, x) -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] label_Activation_Silu_VW1: v_mul_f32 v16, -1.4426950408889634, v8 // (fused -1.442695) v_exp_f32 v16, v16 // exp step 2 @@ -23922,7 +23922,11 @@ v_add_f32 v16, 1.0, v16 // 1 + exp(-x) v_rcp_f32 v16, v16 // 1 / (1 + exp(-x)) s_nop 0 // 1 wait states v_mul_f32 v8, v8, v16 // x / (1 + exp(-x)) -s_setpc_b64 s[72:73] +s_setpc_b64 s[64:65] +label_Activation_Clamp_VW1: +v_min_f32 v8, s[sgpractivationBeta], v8 // min(x, beta) +v_max_f32 v8, s[sgpractivationAlpha], v8 // max(alpha, min(x, beta)) +s_setpc_b64 s[64:65] label_SK_Partials: label_GW_Partials_E0: s_mov_b64 s[sgprSrdWS+0:sgprSrdWS+0+1], s[sgprAddressWS+0:sgprAddressWS+0+1] // init SRD base address @@ -24314,23 +24318,23 @@ s_nop 0 // 1 wait state required when s_waitcnt vmcnt(0) // wait for data store s_barrier // store all data before setting flag s_lshl_b32 s8, s[sgprStreamKIdx], 2 // flag offset based on CTA index -v_readfirstlane_b32 s67, v[vgprSerial] // Wave 0 updates flags -s_cmp_eq_u32 s67, 0 // Check for wave 0 +v_readfirstlane_b32 s64, v[vgprSerial] // Wave 0 updates flags +s_cmp_eq_u32 s64, 0 // Check for wave 0 s_cbranch_scc0 label_SK_SkipFlagSet // Skip flag set -s_mov_b32 s67, 1 // flag data -s_store_dword s67, s[sgprAddressFlags:sgprAddressFlags+1], s8 glc // set flag +s_mov_b32 s64, 1 // flag data +s_store_dword s64, s[sgprAddressFlags:sgprAddressFlags+1], s8 glc // set flag label_SK_SkipFlagSet: s_waitcnt lgkmcnt(0) // wait for flag s_branch label_GW_End // jump to end label_GW_End: s_cmp_ge_u32 s[sgprStreamKIter], s[sgprStreamKIterEnd] // Check if done all StreamK iterations s_cbranch_scc1 label_NoBranch_Y57Y54XUE2DV604X // Only branch on scc0 -s_getpc_b64 s[82:83] // addr of next instr -s_add_i32 s84, label_PersistentLoopStart, 4 // target branch offset -s_abs_i32 s84, s84 // abs offset -s_sub_u32 s82, s82, s84 // sub target branch offset -s_subb_u32 s83, s83, 0 // sub high and carry -s_setpc_b64 s[82:83] // branch to label_PersistentLoopStart +s_getpc_b64 s[74:75] // addr of next instr +s_add_i32 s76, label_PersistentLoopStart, 4 // target branch offset +s_abs_i32 s76, s76 // abs offset +s_sub_u32 s74, s74, s76 // sub target branch offset +s_subb_u32 s75, s75, 0 // sub high and carry +s_setpc_b64 s[74:75] // branch to label_PersistentLoopStart label_NoBranch_Y57Y54XUE2DV604X: label_KernelEnd: s_endpgm // Kernel End diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py index 451621a56334..841db7e28c7c 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py @@ -4690,22 +4690,15 @@ def readWriteVectors(mat, vw, kernel): if kernel["StreamK"]: # StreamK args - self.defineSgpr("MagicNumberProblemNumGroupTiles0", 1) # Magic number to use for division - self.defineSgpr("MagicShiftProblemNumGroupTiles0", 1) # Magic shift/abit to use for division alg 2 self.defineSgpr("ItersPerTile", 1) - self.defineSgpr("MagicNumberItersPerTile", 1) - self.defineSgpr("MagicShiftItersPerTile", 1) - self.defineSgpr("MagicNumProblemNumGroupTiles0By1", 1) # for PKAB, use for Magic Div Alg 2 by (nwg0*nwg1) - self.defineSgpr("MagicShiftProblemNumGroupTiles0By1", 1) # for PKAB, use for Magic Div Alg 2 by (nwg0*nwg1) self.defineSgpr("TotalIters", 1) self.defineSgpr("SKItersPerWG", 1) - self.states.numSgprStreamK += 9 + self.states.numSgprStreamK += 3 if kernel["StreamK"] >= 2: # Two-tile SK - self.defineSgpr("skGrid", 1) - self.defineSgpr("skTiles", 1) + self.defineSgpr("skGridAndTiles", 1) self.defineSgpr("skExtraIters", 1) # self.defineSgpr("dpTilesPerWG", 1, kernarg=True) - self.states.numSgprStreamK += 3 + self.states.numSgprStreamK += 2 if kernel["LocalWriteUseSgprA"]: self.defineSgpr("LocalWriteAddrA", 1) diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py index 001838e3d54c..ab74ba3565ca 100644 --- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py +++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py @@ -787,11 +787,6 @@ def isDirectToLdsDoable(state, tc, isaInfoMap, printRejectionReason: bool): reject(state, printRejectionReason, "DirectToLds does not work with LocalReadVectorWidth > MIInputPerThread") return False - if not state["ProblemType"]["Sparse"] and not(state["ProblemType"]["DataType"].is8bitFloat() and (state["MatrixInstK"] == 64 or state["MatrixInstK"] == 128)): - if state["ProblemType"]["DataType"].isBFloat16() and state["AssertSummationElementMultiple"] % (2 * state["GlobalReadVectorWidth%c"%tc]) != 0: - reject(state, printRejectionReason, "can't use DirectToLds for BF16 with AssertSummationElementMultiple %u" % state["AssertSummationElementMultiple"]) - return False - if state["NumThreads"] % state["WavefrontSize"] != 0: reject(state, printRejectionReason, "can't use DirectToLds for NumThreads % WavefrontSize != 0") return False diff --git a/projects/hipblaslt/tensilelite/Tensile/Source/lib/source/ContractionSolution.cpp b/projects/hipblaslt/tensilelite/Tensile/Source/lib/source/ContractionSolution.cpp index af725096b27a..666a6fb910c0 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Source/lib/source/ContractionSolution.cpp +++ b/projects/hipblaslt/tensilelite/Tensile/Source/lib/source/ContractionSolution.cpp @@ -690,14 +690,6 @@ namespace TensileLite args.append("beta_2", inputs.beta, problem.betaType()); } - if(sizeMapping.persistentKernel != 0 || sizeMapping.streamK != 0) - { - uint32_t magicShift; - args.template append("magicNumberProblemNumGroupTiles0", - magicNumber(2, problemNumGroupTiles.x, &magicShift)); - args.template append("magicShiftProblemNumGroupTiles0", magicShift); - } - if(sizeMapping.streamK != 0) { // SK doesn't care gsu @@ -714,25 +706,9 @@ namespace TensileLite // In this case no actual iterations will be run, but workgroups will be mapped correctly for beta*C auto itersPerTile = max(1, problem.getItersPerTile(sizeMapping)); auto totalIters = tiles * itersPerTile; - uint32_t magicNumberItersPerTile; - uint32_t magicShiftItersPerTile; - magicNumberItersPerTile = magicNumber(2, itersPerTile, &magicShiftItersPerTile); - args.template append("itersPerTile", itersPerTile); - args.template append("magicNumberItersPerTile", magicNumberItersPerTile); - args.template append("magicShiftItersPerTile", magicShiftItersPerTile); - - uint32_t numGroupTiles0x1 = problemNumGroupTiles.x * problemNumGroupTiles.y; - uint32_t magicNumProblemNumGroupTiles0By1; - uint32_t magicShiftProblemNumGroupTiles0By1; - magicNumProblemNumGroupTiles0By1 - = magicNumber(2, numGroupTiles0x1, &magicShiftProblemNumGroupTiles0By1); - args.template append("magicNumProblemNumGroupTiles0By1", - magicNumProblemNumGroupTiles0By1); - args.template append("magicShiftProblemNumGroupTiles0By1", - magicShiftProblemNumGroupTiles0By1); - args.template append("totalIters", totalIters); + if(sizeMapping.streamK == 1) // Basic SK { uint32_t itersPerWave = CeilDivide(totalIters, numWorkGroups.x); @@ -766,9 +742,12 @@ namespace TensileLite uint32_t skItersPerWG = skTiles * itersPerTile / skGrid; uint32_t skExtraIters = skTiles * itersPerTile % (skGrid); + // Pack skGrid and skTiles into a single uint32_t such that the upper 16 bits + // represent skGrid and the lower 16 bits represent skTiles + uint32_t skGridAndTiles = (skGrid <<16) | (skTiles & 0xFFFF); + args.template append("SKItersPerWG", skItersPerWG); - args.template append("skGrid", skGrid); - args.template append("skTiles", skTiles); + args.template append("skGridAndTiles", skGridAndTiles); args.template append("skExtraIters", skExtraIters); } } diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/functions/f_math.cpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/functions/f_math.cpp index 4d9a40fcb872..d430b172d66b 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/functions/f_math.cpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/functions/f_math.cpp @@ -95,53 +95,26 @@ namespace rocisa template std::shared_ptr scalarStaticRemainder( int, int, int, int, std::optional, const std::string&); // template of scalarUInt32DivideAndRemainder - template std::shared_ptr scalarUInt32DivideAndRemainder( - int, int, int, int, ContinuousRegister&, int, bool, const std::string&); - template std::shared_ptr - scalarUInt32DivideAndRemainder( - std::string, - std::string, - std::string, - int, - ContinuousRegister&, - int, - bool, - const std::string&); - template std::shared_ptr scalarUInt32DivideAndRemainder( - int, int, std::string, int, ContinuousRegister&, int, bool, const std::string&); - template std::shared_ptr scalarUInt32DivideAndRemainder( - int, std::string, int, int, ContinuousRegister&, int, bool, const std::string&); - template std::shared_ptr - scalarUInt32DivideAndRemainder( - std::string, std::string, int, int, ContinuousRegister&, int, bool, const std::string&); - template std::shared_ptr - scalarUInt32DivideAndRemainder( - int, std::string, std::string, int, ContinuousRegister&, int, bool, const std::string&); - template std::shared_ptr - scalarUInt32DivideAndRemainder( - std::string, - std::string, - int, - std::string, - ContinuousRegister&, - int, - bool, - const std::string&); - template std::shared_ptr - scalarUInt32DivideAndRemainder( - std::string, - std::string, - std::string, - std::string, - ContinuousRegister&, - int, - bool, - const std::string&); - template std::shared_ptr - scalarUInt32DivideAndRemainder( - std::string, int, int, std::string, ContinuousRegister&, int, bool, const std::string&); - template std::shared_ptr scalarUInt32DivideAndRemainder( - int, int, int, std::string, ContinuousRegister&, int, bool, const std::string&); + #define ExplicitInstantiation(QREG, DREG, DIVREG, RREG) \ + template std::shared_ptr scalarUInt32DivideAndRemainder( \ + QREG, DREG, DIVREG, RREG, ContinuousRegister&, int, bool, const std::string&); + ExplicitInstantiation(std::string, std::string, std::string, std::string) + ExplicitInstantiation(std::string, std::string, std::string, int) + ExplicitInstantiation(std::string, std::string, int, std::string) + ExplicitInstantiation(std::string, std::string, int, int) + ExplicitInstantiation(std::string, int, std::string, std::string) + ExplicitInstantiation(std::string, int, std::string, int) + ExplicitInstantiation(std::string, int, int, std::string) + ExplicitInstantiation(std::string, int, int, int) + ExplicitInstantiation(int, std::string, std::string, std::string) + ExplicitInstantiation(int, std::string, std::string, int) + ExplicitInstantiation(int, std::string, int, std::string) + ExplicitInstantiation(int, std::string, int, int) + ExplicitInstantiation(int, int, std::string, std::string) + ExplicitInstantiation(int, int, std::string, int) + ExplicitInstantiation(int, int, int, std::string) + ExplicitInstantiation(int, int, int, int) + #undef ExplicitInstantiation // template of sMagicDiv template std::shared_ptr sMagicDiv(int dest, bool hasSMulHi, @@ -516,188 +489,42 @@ void math_func(nb::module_ m) nb::arg("divisor"), nb::arg("tmpSgprRes") = std::nullopt, nb::arg("comment") = ""); - m.def("scalarUInt32DivideAndRemainder", - nb::overload_cast( - &rocisa::scalarUInt32DivideAndRemainder), - nb::arg("qReg"), - nb::arg("dReg"), - nb::arg("divReg"), - nb::arg("rReg"), - nb::arg("tmpVgprRes"), - nb::arg("wavewidth"), - nb::arg("doRemainder") = true, - nb::arg("comment") = ""); - m.def("scalarUInt32DivideAndRemainder", - nb::overload_cast( - &rocisa::scalarUInt32DivideAndRemainder), - nb::arg("qReg"), - nb::arg("dReg"), - nb::arg("divReg"), - nb::arg("rReg"), - nb::arg("tmpVgprRes"), - nb::arg("wavewidth"), - nb::arg("doRemainder") = true, - nb::arg("comment") = ""); - m.def("scalarUInt32DivideAndRemainder", - nb::overload_cast( - &rocisa::scalarUInt32DivideAndRemainder), - nb::arg("qReg"), - nb::arg("dReg"), - nb::arg("divReg"), - nb::arg("rReg"), - nb::arg("tmpVgprRes"), - nb::arg("wavewidth"), - nb::arg("doRemainder") = true, - nb::arg("comment") = ""); - m.def("scalarUInt32DivideAndRemainder", - nb::overload_cast( - &rocisa::scalarUInt32DivideAndRemainder), - nb::arg("qReg"), - nb::arg("dReg"), - nb::arg("divReg"), - nb::arg("rReg"), - nb::arg("tmpVgprRes"), - nb::arg("wavewidth"), - nb::arg("doRemainder") = true, - nb::arg("comment") = ""); - m.def("scalarUInt32DivideAndRemainder", - nb::overload_cast( - &rocisa::scalarUInt32DivideAndRemainder), - nb::arg("qReg"), - nb::arg("dReg"), - nb::arg("divReg"), - nb::arg("rReg"), - nb::arg("tmpVgprRes"), - nb::arg("wavewidth"), - nb::arg("doRemainder") = true, - nb::arg("comment") = ""); - m.def("scalarUInt32DivideAndRemainder", - nb::overload_cast( - &rocisa::scalarUInt32DivideAndRemainder), - nb::arg("qReg"), - nb::arg("dReg"), - nb::arg("divReg"), - nb::arg("rReg"), - nb::arg("tmpVgprRes"), - nb::arg("wavewidth"), - nb::arg("doRemainder") = true, - nb::arg("comment") = ""); - m.def("scalarUInt32DivideAndRemainder", - nb::overload_cast( - &rocisa::scalarUInt32DivideAndRemainder), - nb::arg("qReg"), - nb::arg("dReg"), - nb::arg("divReg"), - nb::arg("rReg"), - nb::arg("tmpVgprRes"), - nb::arg("wavewidth"), - nb::arg("doRemainder") = true, - nb::arg("comment") = ""); - m.def( - "scalarUInt32DivideAndRemainder", - nb::overload_cast( - &rocisa:: - scalarUInt32DivideAndRemainder), - nb::arg("qReg"), - nb::arg("dReg"), - nb::arg("divReg"), - nb::arg("rReg"), - nb::arg("tmpVgprRes"), - nb::arg("wavewidth"), - nb::arg("doRemainder") = true, - nb::arg("comment") = ""); - m.def("scalarUInt32DivideAndRemainder", - nb::overload_cast( - &rocisa::scalarUInt32DivideAndRemainder), - nb::arg("qReg"), - nb::arg("dReg"), - nb::arg("divReg"), - nb::arg("rReg"), - nb::arg("tmpVgprRes"), - nb::arg("wavewidth"), - nb::arg("doRemainder") = true, - nb::arg("comment") = ""); - m.def("scalarUInt32DivideAndRemainder", - nb::overload_cast( - &rocisa::scalarUInt32DivideAndRemainder), - nb::arg("qReg"), - nb::arg("dReg"), - nb::arg("divReg"), - nb::arg("rReg"), - nb::arg("tmpVgprRes"), - nb::arg("wavewidth"), - nb::arg("doRemainder") = true, + #define ExplicitInstantiation(QREG, DREG, DIVREG, RREG) \ + m.def("scalarUInt32DivideAndRemainder", \ + nb::overload_cast( \ + &rocisa::scalarUInt32DivideAndRemainder), \ + nb::arg("qReg"), \ + nb::arg("dReg"), \ + nb::arg("divReg"), \ + nb::arg("rReg"), \ + nb::arg("tmpVgprRes"), \ + nb::arg("wavewidth"), \ + nb::arg("doRemainder") = true, \ nb::arg("comment") = ""); + ExplicitInstantiation(std::string, std::string, std::string, std::string) + ExplicitInstantiation(std::string, std::string, std::string, int) + ExplicitInstantiation(std::string, std::string, int, std::string) + ExplicitInstantiation(std::string, std::string, int, int) + ExplicitInstantiation(std::string, int, std::string, std::string) + ExplicitInstantiation(std::string, int, std::string, int) + ExplicitInstantiation(std::string, int, int, std::string) + ExplicitInstantiation(std::string, int, int, int) + ExplicitInstantiation(int, std::string, std::string, std::string) + ExplicitInstantiation(int, std::string, std::string, int) + ExplicitInstantiation(int, std::string, int, std::string) + ExplicitInstantiation(int, std::string, int, int) + ExplicitInstantiation(int, int, std::string, std::string) + ExplicitInstantiation(int, int, std::string, int) + ExplicitInstantiation(int, int, int, std::string) + ExplicitInstantiation(int, int, int, int) + #undef ExplicitInstantiation m.def("sMagicDiv", nb::overload_cast( &rocisa::sMagicDiv),